xref: /linux/drivers/gpu/drm/xe/xe_vm.c (revision 5bfb7e6a7fc0056a974ce13a81c95602a2cae859)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5 
6 #include "xe_vm.h"
7 
8 #include <linux/dma-fence-array.h>
9 #include <linux/nospec.h>
10 
11 #include <drm/drm_drv.h>
12 #include <drm/drm_exec.h>
13 #include <drm/drm_print.h>
14 #include <drm/ttm/ttm_tt.h>
15 #include <uapi/drm/xe_drm.h>
16 #include <linux/ascii85.h>
17 #include <linux/delay.h>
18 #include <linux/kthread.h>
19 #include <linux/mm.h>
20 #include <linux/swap.h>
21 
22 #include <generated/xe_wa_oob.h>
23 
24 #include "regs/xe_gtt_defs.h"
25 #include "xe_assert.h"
26 #include "xe_bo.h"
27 #include "xe_device.h"
28 #include "xe_drm_client.h"
29 #include "xe_exec_queue.h"
30 #include "xe_gt.h"
31 #include "xe_migrate.h"
32 #include "xe_pat.h"
33 #include "xe_pm.h"
34 #include "xe_preempt_fence.h"
35 #include "xe_pt.h"
36 #include "xe_pxp.h"
37 #include "xe_sriov_vf.h"
38 #include "xe_svm.h"
39 #include "xe_sync.h"
40 #include "xe_tile.h"
41 #include "xe_tlb_inval.h"
42 #include "xe_trace_bo.h"
43 #include "xe_vm_madvise.h"
44 #include "xe_wa.h"
45 
46 static struct drm_gem_object *xe_vm_obj(struct xe_vm *vm)
47 {
48 	return vm->gpuvm.r_obj;
49 }
50 
/**
 * xe_vm_drm_exec_lock() - Lock the vm's resv with a drm_exec transaction
 * @vm: The vm whose resv is to be locked.
 * @exec: The drm_exec transaction.
 *
 * Locks the reservation object backing @vm as part of the @exec
 * drm_exec locking transaction.
 *
 * Return: %0 on success. See drm_exec_lock_obj() for error codes.
 */
int xe_vm_drm_exec_lock(struct xe_vm *vm, struct drm_exec *exec)
{
	struct drm_gem_object *obj = xe_vm_obj(vm);

	return drm_exec_lock_obj(exec, obj);
}
64 
65 static bool preempt_fences_waiting(struct xe_vm *vm)
66 {
67 	struct xe_exec_queue *q;
68 
69 	lockdep_assert_held(&vm->lock);
70 	xe_vm_assert_held(vm);
71 
72 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
73 		if (!q->lr.pfence ||
74 		    test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
75 			     &q->lr.pfence->flags)) {
76 			return true;
77 		}
78 	}
79 
80 	return false;
81 }
82 
83 static void free_preempt_fences(struct list_head *list)
84 {
85 	struct list_head *link, *next;
86 
87 	list_for_each_safe(link, next, list)
88 		xe_preempt_fence_free(to_preempt_fence_from_link(link));
89 }
90 
91 static int alloc_preempt_fences(struct xe_vm *vm, struct list_head *list,
92 				unsigned int *count)
93 {
94 	lockdep_assert_held(&vm->lock);
95 	xe_vm_assert_held(vm);
96 
97 	if (*count >= vm->preempt.num_exec_queues)
98 		return 0;
99 
100 	for (; *count < vm->preempt.num_exec_queues; ++(*count)) {
101 		struct xe_preempt_fence *pfence = xe_preempt_fence_alloc();
102 
103 		if (IS_ERR(pfence))
104 			return PTR_ERR(pfence);
105 
106 		list_move_tail(xe_preempt_fence_link(pfence), list);
107 	}
108 
109 	return 0;
110 }
111 
/*
 * Wait for the current preempt fence of every exec queue on @vm to signal,
 * then drop the fence references. On a VF that supports migration the wait
 * is bounded (HZ / 5) so post-migration workers are not starved; timing out
 * there returns -EAGAIN so the caller can retry.
 *
 * Return: 0 on success, -EAGAIN on a bounded-wait timeout, -ETIME if the
 * wait fails or a fence carries an -ETIME error (VM needs to be killed).
 */
static int wait_for_existing_preempt_fences(struct xe_vm *vm)
{
	struct xe_exec_queue *q;
	bool vf_migration = IS_SRIOV_VF(vm->xe) &&
		xe_sriov_vf_migration_supported(vm->xe);
	signed long wait_time = vf_migration ? HZ / 5 : MAX_SCHEDULE_TIMEOUT;

	xe_vm_assert_held(vm);

	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
		if (q->lr.pfence) {
			long timeout;

			timeout = dma_fence_wait_timeout(q->lr.pfence, false,
							 wait_time);
			/* Zero (timed out) can only happen with the bounded VF wait */
			if (!timeout) {
				xe_assert(vm->xe, vf_migration);
				return -EAGAIN;
			}

			/* Only -ETIME on fence indicates VM needs to be killed */
			if (timeout < 0 || q->lr.pfence->error == -ETIME)
				return -ETIME;

			dma_fence_put(q->lr.pfence);
			q->lr.pfence = NULL;
		}
	}

	return 0;
}
143 
144 static bool xe_vm_is_idle(struct xe_vm *vm)
145 {
146 	struct xe_exec_queue *q;
147 
148 	xe_vm_assert_held(vm);
149 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
150 		if (!xe_exec_queue_is_idle(q))
151 			return false;
152 	}
153 
154 	return true;
155 }
156 
/*
 * Pair each exec queue on @vm with a preallocated preempt fence from @list,
 * arming the fence with the queue's context and next seqno, and replacing
 * (and dropping) the queue's previous fence. Each iteration takes the
 * list's first entry, so xe_preempt_fence_arm() is expected to unlink it;
 * @list must hold at least one entry per exec queue.
 */
static void arm_preempt_fences(struct xe_vm *vm, struct list_head *list)
{
	struct list_head *link;
	struct xe_exec_queue *q;

	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
		struct dma_fence *fence;

		link = list->next;
		/* Caller must have preallocated enough fences */
		xe_assert(vm->xe, link != list);

		fence = xe_preempt_fence_arm(to_preempt_fence_from_link(link),
					     q, q->lr.context,
					     ++q->lr.seqno);
		dma_fence_put(q->lr.pfence);
		q->lr.pfence = fence;
	}
}
175 
176 static int add_preempt_fences(struct xe_vm *vm, struct xe_bo *bo)
177 {
178 	struct xe_exec_queue *q;
179 	int err;
180 
181 	xe_bo_assert_held(bo);
182 
183 	if (!vm->preempt.num_exec_queues)
184 		return 0;
185 
186 	err = dma_resv_reserve_fences(bo->ttm.base.resv, vm->preempt.num_exec_queues);
187 	if (err)
188 		return err;
189 
190 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link)
191 		if (q->lr.pfence) {
192 			dma_resv_add_fence(bo->ttm.base.resv,
193 					   q->lr.pfence,
194 					   DMA_RESV_USAGE_BOOKKEEP);
195 		}
196 
197 	return 0;
198 }
199 
/*
 * Resume every exec queue on @vm and install its (newly armed) preempt
 * fence into the GPUVM's reservation objects with bookkeep usage.
 *
 * Called with vm->lock and the vm's dma-resv held, with @exec holding the
 * locks taken during the rebind transaction.
 */
static void resume_and_reinstall_preempt_fences(struct xe_vm *vm,
						struct drm_exec *exec)
{
	struct xe_exec_queue *q;

	lockdep_assert_held(&vm->lock);
	xe_vm_assert_held(vm);

	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
		q->ops->resume(q);

		drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, q->lr.pfence,
					 DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_BOOKKEEP);
	}
}
215 
/**
 * xe_vm_add_compute_exec_queue() - Add a long-running exec queue to a
 * preempt-fence mode VM.
 * @vm: The VM.
 * @q: The exec queue to add.
 *
 * Creates a preempt fence for @q, links the queue into the VM's preempt
 * exec-queue list and installs the fence in the VM's dma-resv with
 * bookkeep usage. If a preemption or userptr invalidation is already in
 * flight, signaling is enabled on the new fence immediately so it
 * synchronizes with the other preempt fences on the VM.
 *
 * Return: 0 on success, negative error code on failure.
 */
int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
{
	struct drm_gpuvm_exec vm_exec = {
		.vm = &vm->gpuvm,
		.flags = DRM_EXEC_INTERRUPTIBLE_WAIT,
		.num_fences = 1,
	};
	struct drm_exec *exec = &vm_exec.exec;
	struct xe_validation_ctx ctx;
	struct dma_fence *pfence;
	int err;
	bool wait;

	xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));

	down_write(&vm->lock);
	/* Take the VM's reservation locks with one fence slot reserved */
	err = xe_validation_exec_lock(&ctx, &vm_exec, &vm->xe->val);
	if (err)
		goto out_up_write;

	pfence = xe_preempt_fence_create(q, q->lr.context,
					 ++q->lr.seqno);
	if (IS_ERR(pfence)) {
		err = PTR_ERR(pfence);
		goto out_fini;
	}

	list_add(&q->lr.link, &vm->preempt.exec_queues);
	++vm->preempt.num_exec_queues;
	q->lr.pfence = pfence;

	xe_svm_notifier_lock(vm);

	drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, pfence,
				 DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_BOOKKEEP);

	/*
	 * Check to see if a preemption on VM is in flight or userptr
	 * invalidation, if so trigger this preempt fence to sync state with
	 * other preempt fences on the VM.
	 */
	wait = __xe_vm_userptr_needs_repin(vm) || preempt_fences_waiting(vm);
	if (wait)
		dma_fence_enable_sw_signaling(pfence);

	xe_svm_notifier_unlock(vm);

out_fini:
	xe_validation_ctx_fini(&ctx);
out_up_write:
	up_write(&vm->lock);

	return err;
}
270 ALLOW_ERROR_INJECTION(xe_vm_add_compute_exec_queue, ERRNO);
271 
272 /**
273  * xe_vm_remove_compute_exec_queue() - Remove compute exec queue from VM
274  * @vm: The VM.
275  * @q: The exec_queue
276  *
277  * Note that this function might be called multiple times on the same queue.
278  */
279 void xe_vm_remove_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
280 {
281 	if (!xe_vm_in_preempt_fence_mode(vm))
282 		return;
283 
284 	down_write(&vm->lock);
285 	if (!list_empty(&q->lr.link)) {
286 		list_del_init(&q->lr.link);
287 		--vm->preempt.num_exec_queues;
288 	}
289 	if (q->lr.pfence) {
290 		dma_fence_enable_sw_signaling(q->lr.pfence);
291 		dma_fence_put(q->lr.pfence);
292 		q->lr.pfence = NULL;
293 	}
294 	up_write(&vm->lock);
295 }
296 
297 #define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000
298 
299 /**
300  * xe_vm_kill() - VM Kill
301  * @vm: The VM.
302  * @unlocked: Flag indicates the VM's dma-resv is not held
303  *
304  * Kill the VM by setting banned flag indicated VM is no longer available for
305  * use. If in preempt fence mode, also kill all exec queue attached to the VM.
306  */
307 void xe_vm_kill(struct xe_vm *vm, bool unlocked)
308 {
309 	struct xe_exec_queue *q;
310 
311 	lockdep_assert_held(&vm->lock);
312 
313 	if (unlocked)
314 		xe_vm_lock(vm, false);
315 
316 	vm->flags |= XE_VM_FLAG_BANNED;
317 	trace_xe_vm_kill(vm);
318 
319 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link)
320 		q->ops->kill(q);
321 
322 	if (unlocked)
323 		xe_vm_unlock(vm);
324 
325 	/* TODO: Inform user the VM is banned */
326 }
327 
/*
 * drm_gpuvm validation callback: move all VMAs of an evicted vm_bo onto
 * the VM's rebind list and re-validate the backing BO.
 *
 * Return: 0 on success, -EAGAIN when blocked by a pending PM suspend, or
 * the error from xe_bo_validate().
 */
static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec)
{
	struct xe_vm *vm = gpuvm_to_vm(vm_bo->vm);
	struct xe_bo *bo = gem_to_xe_bo(vm_bo->obj);
	struct drm_gpuva *gpuva;
	int ret;

	lockdep_assert_held(&vm->lock);
	drm_gpuvm_bo_for_each_va(gpuva, vm_bo)
		list_move_tail(&gpuva_to_vma(gpuva)->combined_links.rebind,
			       &vm->rebind_list);

	/* Skip re-populating purged BOs, rebind maps scratch pages. */
	if (xe_bo_is_purged(bo)) {
		vm_bo->evicted = false;
		return 0;
	}

	/* Don't validate while a suspend is blocking new work */
	if (!try_wait_for_completion(&vm->xe->pm_block))
		return -EAGAIN;

	ret = xe_bo_validate(bo, vm, false, exec);
	if (ret)
		return ret;

	vm_bo->evicted = false;
	return 0;
}
356 
/**
 * xe_vm_validate_rebind() - Validate buffer objects and rebind vmas
 * @vm: The vm for which we are rebinding.
 * @exec: The struct drm_exec with the locked GEM objects.
 * @num_fences: The number of fences to reserve for the operation, not
 * including rebinds and validations.
 *
 * Validates all evicted gem objects and rebinds their vmas. Note that
 * rebindings may cause evictions and hence the validation-rebind
 * sequence is rerun until there are no more objects to validate.
 *
 * Return: 0 on success, negative error code on error. In particular,
 * may return -EINTR or -ERESTARTSYS if interrupted, and -EDEADLK if
 * the drm_exec transaction needs to be restarted.
 */
int xe_vm_validate_rebind(struct xe_vm *vm, struct drm_exec *exec,
			  unsigned int num_fences)
{
	struct drm_gem_object *obj;
	unsigned long index;
	int ret;

	do {
		ret = drm_gpuvm_validate(&vm->gpuvm, exec);
		if (ret)
			return ret;

		ret = xe_vm_rebind(vm, false);
		if (ret)
			return ret;
		/* Rebinding may evict further objects; loop until stable */
	} while (!list_empty(&vm->gpuvm.evict.list));

	/* Reserve the caller's fence slots on every locked object */
	drm_exec_for_each_locked_object(exec, index, obj) {
		ret = dma_resv_reserve_fences(obj->resv, num_fences);
		if (ret)
			return ret;
	}

	return 0;
}
397 
/*
 * Locking/validation stage of the preempt rebind worker: lock the VM's
 * reservation objects, bail out early (*done = true) when the VM is idle
 * or no preempt fence needs re-arming, otherwise wait out the existing
 * preempt fences and validate/rebind all evicted BOs.
 *
 * Return: 0 on success (check *done), negative error code otherwise.
 */
static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm,
				 bool *done)
{
	int err;

	err = drm_gpuvm_prepare_vm(&vm->gpuvm, exec, 0);
	if (err)
		return err;

	if (xe_vm_is_idle(vm)) {
		vm->preempt.rebind_deactivated = true;
		*done = true;
		return 0;
	}

	if (!preempt_fences_waiting(vm)) {
		*done = true;
		return 0;
	}

	err = drm_gpuvm_prepare_objects(&vm->gpuvm, exec, 0);
	if (err)
		return err;

	err = wait_for_existing_preempt_fences(vm);
	if (err)
		return err;

	/*
	 * Add validation and rebinding to the locking loop since both can
	 * cause evictions which may require blocking dma_resv locks.
	 * The fence reservation here is intended for the new preempt fences
	 * we attach at the end of the rebind work.
	 */
	return xe_vm_validate_rebind(vm, exec, vm->preempt.num_exec_queues);
}
434 
435 static bool vm_suspend_rebind_worker(struct xe_vm *vm)
436 {
437 	struct xe_device *xe = vm->xe;
438 	bool ret = false;
439 
440 	mutex_lock(&xe->rebind_resume_lock);
441 	if (!try_wait_for_completion(&vm->xe->pm_block)) {
442 		ret = true;
443 		list_move_tail(&vm->preempt.pm_activate_link, &xe->rebind_resume_list);
444 	}
445 	mutex_unlock(&xe->rebind_resume_lock);
446 
447 	return ret;
448 }
449 
450 /**
451  * xe_vm_resume_rebind_worker() - Resume the rebind worker.
452  * @vm: The vm whose preempt worker to resume.
453  *
454  * Resume a preempt worker that was previously suspended by
455  * vm_suspend_rebind_worker().
456  */
457 void xe_vm_resume_rebind_worker(struct xe_vm *vm)
458 {
459 	queue_work(vm->xe->ordered_wq, &vm->preempt.rebind_work);
460 }
461 
/*
 * Rebind worker for a preempt-fence mode VM: once the VM's preempt fences
 * have been triggered (eviction or userptr invalidation), re-validate
 * evicted BOs, repin userptrs and rebind VMAs, then arm fresh preempt
 * fences and resume the attached exec queues. On unrecoverable errors the
 * VM is killed.
 */
static void preempt_rebind_work_func(struct work_struct *w)
{
	struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
	struct xe_validation_ctx ctx;
	struct drm_exec exec;
	unsigned int fence_count = 0;
	LIST_HEAD(preempt_fences);
	int err = 0;
	long wait;
	int __maybe_unused tries = 0;

	xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
	trace_xe_vm_rebind_worker_enter(vm);

	down_write(&vm->lock);

	if (xe_vm_is_closed_or_banned(vm)) {
		up_write(&vm->lock);
		trace_xe_vm_rebind_worker_exit(vm);
		return;
	}

retry:
	/* Park the worker during a PM suspend; it is requeued on resume */
	if (!try_wait_for_completion(&vm->xe->pm_block) && vm_suspend_rebind_worker(vm)) {
		up_write(&vm->lock);
		/* We don't actually block but don't make progress. */
		xe_pm_might_block_on_suspend();
		return;
	}

	if (xe_vm_userptr_check_repin(vm)) {
		err = xe_vm_userptr_pin(vm);
		if (err)
			goto out_unlock_outer;
	}

	err = xe_validation_ctx_init(&ctx, &vm->xe->val, &exec,
				     (struct xe_val_flags) {.interruptible = true});
	if (err)
		goto out_unlock_outer;

	drm_exec_until_all_locked(&exec) {
		bool done = false;

		err = xe_preempt_work_begin(&exec, vm, &done);
		drm_exec_retry_on_contention(&exec);
		xe_validation_retry_on_oom(&ctx, &err);
		if (err || done) {
			xe_validation_ctx_fini(&ctx);
			goto out_unlock_outer;
		}
	}

	err = alloc_preempt_fences(vm, &preempt_fences, &fence_count);
	if (err)
		goto out_unlock;

	xe_vm_set_validation_exec(vm, &exec);
	err = xe_vm_rebind(vm, true);
	xe_vm_set_validation_exec(vm, NULL);
	if (err)
		goto out_unlock;

	/* Wait on rebinds and munmap style VM unbinds */
	wait = dma_resv_wait_timeout(xe_vm_resv(vm),
				     DMA_RESV_USAGE_KERNEL,
				     false, MAX_SCHEDULE_TIMEOUT);
	if (wait <= 0) {
		err = -ETIME;
		goto out_unlock;
	}

#define retry_required(__tries, __vm) \
	(IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT) ? \
	(!(__tries)++ || __xe_vm_userptr_needs_repin(__vm)) : \
	__xe_vm_userptr_needs_repin(__vm))

	/* A userptr invalidation raced us: restart the whole sequence */
	xe_svm_notifier_lock(vm);
	if (retry_required(tries, vm)) {
		xe_svm_notifier_unlock(vm);
		err = -EAGAIN;
		goto out_unlock;
	}

#undef retry_required

	spin_lock(&vm->xe->ttm.lru_lock);
	ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
	spin_unlock(&vm->xe->ttm.lru_lock);

	/* Point of no return. */
	arm_preempt_fences(vm, &preempt_fences);
	resume_and_reinstall_preempt_fences(vm, &exec);
	xe_svm_notifier_unlock(vm);

out_unlock:
	xe_validation_ctx_fini(&ctx);
out_unlock_outer:
	if (err == -EAGAIN) {
		trace_xe_vm_rebind_worker_retry(vm);

		/*
		 * We can't block in workers on a VF which supports migration
		 * given this can block the VF post-migration workers from
		 * getting scheduled.
		 */
		if (IS_SRIOV_VF(vm->xe) &&
		    xe_sriov_vf_migration_supported(vm->xe)) {
			up_write(&vm->lock);
			xe_vm_queue_rebind_worker(vm);
			return;
		}

		goto retry;
	}

	if (err) {
		drm_warn(&vm->xe->drm, "VM worker error: %d\n", err);
		xe_vm_kill(vm, true);
	}
	up_write(&vm->lock);

	free_preempt_fences(&preempt_fences);

	trace_xe_vm_rebind_worker_exit(vm);
}
588 
589 /**
590  * xe_vm_add_fault_entry_pf() - Add pagefault to vm fault list
591  * @vm: The VM.
592  * @pf: The pagefault.
593  *
594  * This function takes the data from the pagefault @pf and saves it to @vm->faults.list.
595  *
596  * The function exits silently if the list is full, and reports a warning if the pagefault
597  * could not be saved to the list.
598  */
599 void xe_vm_add_fault_entry_pf(struct xe_vm *vm, struct xe_pagefault *pf)
600 {
601 	struct xe_vm_fault_entry *e;
602 	struct xe_hw_engine *hwe;
603 
604 	/* Do not report faults on reserved engines */
605 	hwe = xe_gt_hw_engine(pf->gt, pf->consumer.engine_class,
606 			      pf->consumer.engine_instance, false);
607 	if (!hwe || xe_hw_engine_is_reserved(hwe))
608 		return;
609 
610 	e = kzalloc_obj(*e);
611 	if (!e) {
612 		drm_warn(&vm->xe->drm,
613 			 "Could not allocate memory for fault!\n");
614 		return;
615 	}
616 
617 	guard(spinlock)(&vm->faults.lock);
618 
619 	/*
620 	 * Limit the number of faults in the fault list to prevent
621 	 * memory overuse.
622 	 */
623 	if (vm->faults.len >= MAX_FAULTS_SAVED_PER_VM) {
624 		kfree(e);
625 		return;
626 	}
627 
628 	e->address = pf->consumer.page_addr;
629 	/*
630 	 * TODO:
631 	 * Address precision is currently always SZ_4K, but this may change
632 	 * in the future.
633 	 */
634 	e->address_precision = SZ_4K;
635 	e->access_type = pf->consumer.access_type;
636 	e->fault_type = FIELD_GET(XE_PAGEFAULT_TYPE_MASK,
637 				  pf->consumer.fault_type_level),
638 	e->fault_level = FIELD_GET(XE_PAGEFAULT_LEVEL_MASK,
639 				   pf->consumer.fault_type_level),
640 
641 	list_add_tail(&e->list, &vm->faults.list);
642 	vm->faults.len++;
643 }
644 
645 static void xe_vm_clear_fault_entries(struct xe_vm *vm)
646 {
647 	struct xe_vm_fault_entry *e, *tmp;
648 
649 	guard(spinlock)(&vm->faults.lock);
650 	list_for_each_entry_safe(e, tmp, &vm->faults.list, list) {
651 		list_del(&e->list);
652 		kfree(e);
653 	}
654 	vm->faults.len = 0;
655 }
656 
/*
 * Allocate, for every tile with pending ops, the array of PT update ops.
 *
 * Return: 0 on success. On allocation failure returns -ENOBUFS when
 * @array_of_binds is set, otherwise -ENOMEM.
 */
static int xe_vma_ops_alloc(struct xe_vma_ops *vops, bool array_of_binds)
{
	int i;

	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i) {
		if (!vops->pt_update_ops[i].num_ops)
			continue;

		vops->pt_update_ops[i].ops =
			kmalloc_objs(*vops->pt_update_ops[i].ops,
				     vops->pt_update_ops[i].num_ops,
				     GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
		if (!vops->pt_update_ops[i].ops)
			return array_of_binds ? -ENOBUFS : -ENOMEM;
	}

	return 0;
}
675 ALLOW_ERROR_INJECTION(xe_vma_ops_alloc, ERRNO);
676 
677 static void xe_vma_svm_prefetch_op_fini(struct xe_vma_op *op)
678 {
679 	struct xe_vma *vma;
680 
681 	vma = gpuva_to_vma(op->base.prefetch.va);
682 
683 	if (op->base.op == DRM_GPUVA_OP_PREFETCH && xe_vma_is_cpu_addr_mirror(vma))
684 		xa_destroy(&op->prefetch_range.range);
685 }
686 
/*
 * Tear down SVM prefetch state for every op in @vops; no-op unless the
 * HAS_SVM_PREFETCH flag was set while building the op list.
 */
static void xe_vma_svm_prefetch_ops_fini(struct xe_vma_ops *vops)
{
	struct xe_vma_op *op;

	if (!(vops->flags & XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH))
		return;

	list_for_each_entry(op, &vops->list, link)
		xe_vma_svm_prefetch_op_fini(op);
}
697 
698 static void xe_vma_ops_fini(struct xe_vma_ops *vops)
699 {
700 	int i;
701 
702 	xe_vma_svm_prefetch_ops_fini(vops);
703 
704 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
705 		kfree(vops->pt_update_ops[i].ops);
706 }
707 
708 static void xe_vma_ops_incr_pt_update_ops(struct xe_vma_ops *vops, u8 tile_mask, int inc_val)
709 {
710 	int i;
711 
712 	if (!inc_val)
713 		return;
714 
715 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
716 		if (BIT(i) & tile_mask)
717 			vops->pt_update_ops[i].num_ops += inc_val;
718 }
719 
720 #define XE_VMA_CREATE_MASK (		    \
721 	XE_VMA_READ_ONLY |		    \
722 	XE_VMA_DUMPABLE |		    \
723 	XE_VMA_SYSTEM_ALLOCATOR |           \
724 	DRM_GPUVA_SPARSE |		    \
725 	XE_VMA_MADV_AUTORESET)
726 
/*
 * Fill in @op as an immediate GPUVA map operation that re-creates @vma's
 * existing mapping on the tiles in @tile_mask, carrying over the VMA's
 * creation-time flags (XE_VMA_CREATE_MASK).
 */
static void xe_vm_populate_rebind(struct xe_vma_op *op, struct xe_vma *vma,
				  u8 tile_mask)
{
	INIT_LIST_HEAD(&op->link);
	op->tile_mask = tile_mask;
	op->base.op = DRM_GPUVA_OP_MAP;
	op->base.map.va.addr = vma->gpuva.va.addr;
	op->base.map.va.range = vma->gpuva.va.range;
	op->base.map.gem.obj = vma->gpuva.gem.obj;
	op->base.map.gem.offset = vma->gpuva.gem.offset;
	op->map.vma = vma;
	op->map.immediate = true;
	op->map.vma_flags = vma->gpuva.flags & XE_VMA_CREATE_MASK;
}
741 
742 static int xe_vm_ops_add_rebind(struct xe_vma_ops *vops, struct xe_vma *vma,
743 				u8 tile_mask)
744 {
745 	struct xe_vma_op *op;
746 
747 	op = kzalloc_obj(*op);
748 	if (!op)
749 		return -ENOMEM;
750 
751 	xe_vm_populate_rebind(op, vma, tile_mask);
752 	list_add_tail(&op->link, &vops->list);
753 	xe_vma_ops_incr_pt_update_ops(vops, tile_mask, 1);
754 
755 	return 0;
756 }
757 
758 static struct dma_fence *ops_execute(struct xe_vm *vm,
759 				     struct xe_vma_ops *vops);
760 static void xe_vma_ops_init(struct xe_vma_ops *vops, struct xe_vm *vm,
761 			    struct xe_exec_queue *q,
762 			    struct xe_sync_entry *syncs, u32 num_syncs);
763 
/**
 * xe_vm_rebind() - Rebind all VMAs on the VM's rebind list
 * @vm: The VM.
 * @rebind_worker: True when called from the preempt rebind worker
 * (required for rebinds on an LR-mode VM).
 *
 * Builds map operations for every VMA on @vm->rebind_list, waiting on the
 * VM's bookkeep fences, and executes them. On success the VMAs are
 * removed from the rebind list.
 *
 * Return: 0 on success, negative error code on failure.
 */
int xe_vm_rebind(struct xe_vm *vm, bool rebind_worker)
{
	struct dma_fence *fence;
	struct xe_vma *vma, *next;
	struct xe_vma_ops vops;
	struct xe_vma_op *op, *next_op;
	int err, i;

	lockdep_assert_held(&vm->lock);
	/* In LR mode, only the rebind worker performs rebinds */
	if ((xe_vm_in_lr_mode(vm) && !rebind_worker) ||
	    list_empty(&vm->rebind_list))
		return 0;

	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
		vops.pt_update_ops[i].wait_vm_bookkeep = true;

	xe_vm_assert_held(vm);
	list_for_each_entry(vma, &vm->rebind_list, combined_links.rebind) {
		xe_assert(vm->xe, vma->tile_present);

		if (rebind_worker)
			trace_xe_vma_rebind_worker(vma);
		else
			trace_xe_vma_rebind_exec(vma);

		err = xe_vm_ops_add_rebind(&vops, vma,
					   vma->tile_present);
		if (err)
			goto free_ops;
	}

	err = xe_vma_ops_alloc(&vops, false);
	if (err)
		goto free_ops;

	fence = ops_execute(vm, &vops);
	if (IS_ERR(fence)) {
		err = PTR_ERR(fence);
	} else {
		dma_fence_put(fence);
		list_for_each_entry_safe(vma, next, &vm->rebind_list,
					 combined_links.rebind)
			list_del_init(&vma->combined_links.rebind);
	}
free_ops:
	list_for_each_entry_safe(op, next_op, &vops.list, link) {
		list_del(&op->link);
		kfree(op);
	}
	xe_vma_ops_fini(&vops);

	return err;
}
818 
/**
 * xe_vma_rebind() - Rebind a single VMA on selected tiles
 * @vm: The VM (must be in fault mode).
 * @vma: The VMA to rebind.
 * @tile_mask: Tiles on which to (re)create the mapping.
 *
 * Executes a map operation for @vma on the tiles' migrate exec queues,
 * waiting on the VM's bookkeep fences. XE_VMA_OPS_FLAG_SKIP_TLB_WAIT is
 * set on the operation list.
 *
 * Return: dma fence signaling completion on success, ERR_PTR on failure.
 */
struct dma_fence *xe_vma_rebind(struct xe_vm *vm, struct xe_vma *vma, u8 tile_mask)
{
	struct dma_fence *fence = NULL;
	struct xe_vma_ops vops;
	struct xe_vma_op *op, *next_op;
	struct xe_tile *tile;
	u8 id;
	int err;

	lockdep_assert_held(&vm->lock);
	xe_vm_assert_held(vm);
	xe_assert(vm->xe, xe_vm_in_fault_mode(vm));

	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
	vops.flags |= XE_VMA_OPS_FLAG_SKIP_TLB_WAIT;
	for_each_tile(tile, vm->xe, id) {
		vops.pt_update_ops[id].wait_vm_bookkeep = true;
		vops.pt_update_ops[tile->id].q =
			xe_migrate_exec_queue(tile->migrate);
	}

	err = xe_vm_ops_add_rebind(&vops, vma, tile_mask);
	if (err)
		return ERR_PTR(err);

	err = xe_vma_ops_alloc(&vops, false);
	if (err) {
		fence = ERR_PTR(err);
		goto free_ops;
	}

	fence = ops_execute(vm, &vops);

free_ops:
	list_for_each_entry_safe(op, next_op, &vops.list, link) {
		list_del(&op->link);
		kfree(op);
	}
	xe_vma_ops_fini(&vops);

	return fence;
}
861 
/*
 * Fill in @op as a driver-specific MAP_RANGE sub-operation binding SVM
 * range @range of @vma on the tiles in @tile_mask.
 */
static void xe_vm_populate_range_rebind(struct xe_vma_op *op,
					struct xe_vma *vma,
					struct xe_svm_range *range,
					u8 tile_mask)
{
	INIT_LIST_HEAD(&op->link);
	op->tile_mask = tile_mask;
	op->base.op = DRM_GPUVA_OP_DRIVER;
	op->subop = XE_VMA_SUBOP_MAP_RANGE;
	op->map_range.vma = vma;
	op->map_range.range = range;
}
874 
875 static int
876 xe_vm_ops_add_range_rebind(struct xe_vma_ops *vops,
877 			   struct xe_vma *vma,
878 			   struct xe_svm_range *range,
879 			   u8 tile_mask)
880 {
881 	struct xe_vma_op *op;
882 
883 	op = kzalloc_obj(*op);
884 	if (!op)
885 		return -ENOMEM;
886 
887 	xe_vm_populate_range_rebind(op, vma, range, tile_mask);
888 	list_add_tail(&op->link, &vops->list);
889 	xe_vma_ops_incr_pt_update_ops(vops, tile_mask, 1);
890 
891 	return 0;
892 }
893 
/**
 * xe_vm_range_rebind() - VM range (re)bind
 * @vm: The VM which the range belongs to.
 * @vma: The VMA which the range belongs to.
 * @range: SVM range to rebind.
 * @tile_mask: Tile mask to bind the range to.
 *
 * (re)bind SVM range setting up GPU page tables for the range.
 *
 * Return: dma fence for rebind to signal completion on success, ERR_PTR on
 * failure
 */
struct dma_fence *xe_vm_range_rebind(struct xe_vm *vm,
				     struct xe_vma *vma,
				     struct xe_svm_range *range,
				     u8 tile_mask)
{
	struct dma_fence *fence = NULL;
	struct xe_vma_ops vops;
	struct xe_vma_op *op, *next_op;
	struct xe_tile *tile;
	u8 id;
	int err;

	lockdep_assert_held(&vm->lock);
	xe_vm_assert_held(vm);
	xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
	xe_assert(vm->xe, xe_vma_is_cpu_addr_mirror(vma));

	/* Run on the migrate queues, waiting on the VM's bookkeep fences */
	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
	vops.flags |= XE_VMA_OPS_FLAG_SKIP_TLB_WAIT;
	for_each_tile(tile, vm->xe, id) {
		vops.pt_update_ops[id].wait_vm_bookkeep = true;
		vops.pt_update_ops[tile->id].q =
			xe_migrate_exec_queue(tile->migrate);
	}

	err = xe_vm_ops_add_range_rebind(&vops, vma, range, tile_mask);
	if (err)
		return ERR_PTR(err);

	err = xe_vma_ops_alloc(&vops, false);
	if (err) {
		fence = ERR_PTR(err);
		goto free_ops;
	}

	fence = ops_execute(vm, &vops);

free_ops:
	list_for_each_entry_safe(op, next_op, &vops.list, link) {
		list_del(&op->link);
		kfree(op);
	}
	xe_vma_ops_fini(&vops);

	return fence;
}
952 
/*
 * Fill in @op as a driver-specific UNMAP_RANGE sub-operation removing SVM
 * range @range from the tiles where it is currently present.
 */
static void xe_vm_populate_range_unbind(struct xe_vma_op *op,
					struct xe_svm_range *range)
{
	INIT_LIST_HEAD(&op->link);
	op->tile_mask = range->tile_present;
	op->base.op = DRM_GPUVA_OP_DRIVER;
	op->subop = XE_VMA_SUBOP_UNMAP_RANGE;
	op->unmap_range.range = range;
}
962 
963 static int
964 xe_vm_ops_add_range_unbind(struct xe_vma_ops *vops,
965 			   struct xe_svm_range *range)
966 {
967 	struct xe_vma_op *op;
968 
969 	op = kzalloc_obj(*op);
970 	if (!op)
971 		return -ENOMEM;
972 
973 	xe_vm_populate_range_unbind(op, range);
974 	list_add_tail(&op->link, &vops->list);
975 	xe_vma_ops_incr_pt_update_ops(vops, range->tile_present, 1);
976 
977 	return 0;
978 }
979 
/**
 * xe_vm_range_unbind() - VM range unbind
 * @vm: The VM which the range belongs to.
 * @range: SVM range to rebind.
 *
 * Unbind SVM range removing the GPU page tables for the range.
 *
 * Return: dma fence for unbind to signal completion on success, ERR_PTR on
 * failure
 */
struct dma_fence *xe_vm_range_unbind(struct xe_vm *vm,
				     struct xe_svm_range *range)
{
	struct dma_fence *fence = NULL;
	struct xe_vma_ops vops;
	struct xe_vma_op *op, *next_op;
	struct xe_tile *tile;
	u8 id;
	int err;

	lockdep_assert_held(&vm->lock);
	xe_vm_assert_held(vm);
	xe_assert(vm->xe, xe_vm_in_fault_mode(vm));

	/* Not bound on any tile: nothing to tear down */
	if (!range->tile_present)
		return dma_fence_get_stub();

	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
	for_each_tile(tile, vm->xe, id) {
		vops.pt_update_ops[id].wait_vm_bookkeep = true;
		vops.pt_update_ops[tile->id].q =
			xe_migrate_exec_queue(tile->migrate);
	}

	err = xe_vm_ops_add_range_unbind(&vops, range);
	if (err)
		return ERR_PTR(err);

	err = xe_vma_ops_alloc(&vops, false);
	if (err) {
		fence = ERR_PTR(err);
		goto free_ops;
	}

	fence = ops_execute(vm, &vops);

free_ops:
	list_for_each_entry_safe(op, next_op, &vops.list, link) {
		list_del(&op->link);
		kfree(op);
	}
	xe_vma_ops_fini(&vops);

	return fence;
}
1035 
/* Drop references held by @attr (currently the preferred-location pagemap). */
static void xe_vma_mem_attr_fini(struct xe_vma_mem_attr *attr)
{
	drm_pagemap_put(attr->preferred_loc.dpagemap);
}
1040 
1041 static void xe_vma_free(struct xe_vma *vma)
1042 {
1043 	xe_vma_mem_attr_fini(&vma->attr);
1044 
1045 	if (xe_vma_is_userptr(vma))
1046 		kfree(to_userptr_vma(vma));
1047 	else
1048 		kfree(vma);
1049 }
1050 
1051 /**
1052  * xe_vma_mem_attr_copy() - copy an xe_vma_mem_attr structure.
1053  * @to: Destination.
1054  * @from: Source.
1055  *
1056  * Copies an xe_vma_mem_attr structure taking care to get reference
1057  * counting of individual members right.
1058  */
1059 void xe_vma_mem_attr_copy(struct xe_vma_mem_attr *to, struct xe_vma_mem_attr *from)
1060 {
1061 	xe_vma_mem_attr_fini(to);
1062 	*to = *from;
1063 	if (to->preferred_loc.dpagemap)
1064 		drm_pagemap_get(to->preferred_loc.dpagemap);
1065 }
1066 
/*
 * Allocate and initialize a VMA covering [@start, @end] (end inclusive).
 *
 * @bo is NULL for userptr, sparse/NULL and CPU-address-mirror VMAs; which
 * of those it is is determined by @flags. @attr is copied with proper
 * reference counting. BO-backed VMAs take a GEM object reference; the
 * other kinds take a VM reference.
 *
 * Return: the new VMA, or an ERR_PTR on failure.
 */
static struct xe_vma *xe_vma_create(struct xe_vm *vm,
				    struct xe_bo *bo,
				    u64 bo_offset_or_userptr,
				    u64 start, u64 end,
				    struct xe_vma_mem_attr *attr,
				    unsigned int flags)
{
	struct xe_vma *vma;
	struct xe_tile *tile;
	u8 id;
	bool is_null = (flags & DRM_GPUVA_SPARSE);
	bool is_cpu_addr_mirror = (flags & XE_VMA_SYSTEM_ALLOCATOR);

	xe_assert(vm->xe, start < end);
	xe_assert(vm->xe, end < vm->size);

	/*
	 * Allocate and ensure that the xe_vma_is_userptr() return
	 * matches what was allocated.
	 */
	if (!bo && !is_null && !is_cpu_addr_mirror) {
		struct xe_userptr_vma *uvma = kzalloc_obj(*uvma);

		if (!uvma)
			return ERR_PTR(-ENOMEM);

		vma = &uvma->vma;
	} else {
		vma = kzalloc_obj(*vma);
		if (!vma)
			return ERR_PTR(-ENOMEM);

		if (bo)
			vma->gpuva.gem.obj = &bo->ttm.base;
	}

	INIT_LIST_HEAD(&vma->combined_links.rebind);

	INIT_LIST_HEAD(&vma->gpuva.gem.entry);
	vma->gpuva.vm = &vm->gpuvm;
	vma->gpuva.va.addr = start;
	vma->gpuva.va.range = end - start + 1;
	vma->gpuva.flags = flags;

	/* New VMAs start out selected on every tile */
	for_each_tile(tile, vm->xe, id)
		vma->tile_mask |= 0x1 << id;

	if (vm->xe->info.has_atomic_enable_pte_bit)
		vma->gpuva.flags |= XE_VMA_ATOMIC_PTE_BIT;

	xe_vma_mem_attr_copy(&vma->attr, attr);
	if (bo) {
		struct drm_gpuvm_bo *vm_bo;

		xe_bo_assert_held(bo);

		vm_bo = drm_gpuvm_bo_obtain_locked(vma->gpuva.vm, &bo->ttm.base);
		if (IS_ERR(vm_bo)) {
			xe_vma_free(vma);
			return ERR_CAST(vm_bo);
		}

		drm_gpuvm_bo_extobj_add(vm_bo);
		/* The VMA itself holds a reference on the BO */
		drm_gem_object_get(&bo->ttm.base);
		vma->gpuva.gem.offset = bo_offset_or_userptr;
		drm_gpuva_link(&vma->gpuva, vm_bo);
		drm_gpuvm_bo_put(vm_bo);
	} else /* userptr or null */ {
		if (!is_null && !is_cpu_addr_mirror) {
			struct xe_userptr_vma *uvma = to_userptr_vma(vma);
			u64 size = end - start + 1;
			int err;

			vma->gpuva.gem.offset = bo_offset_or_userptr;

			err = xe_userptr_setup(uvma, xe_vma_userptr(vma), size);
			if (err) {
				xe_vma_free(vma);
				return ERR_PTR(err);
			}
		}

		xe_vm_get(vm);
	}

	return vma;
}
1154 
/*
 * Final stage of VMA destruction: drop the attached user fence, tear down
 * userptr state, and release the VM or BO reference the VMA held, then
 * free the VMA itself. Runs from process context (possibly via the
 * destroy worker).
 */
static void xe_vma_destroy_late(struct xe_vma *vma)
{
	struct xe_vm *vm = xe_vma_vm(vma);
	struct xe_bo *bo = xe_vma_bo(vma);

	if (vma->ufence) {
		xe_sync_ufence_put(vma->ufence);
		vma->ufence = NULL;
	}

	if (xe_vma_is_userptr(vma)) {
		struct xe_userptr_vma *uvma = to_userptr_vma(vma);

		xe_userptr_remove(uvma);
		xe_vm_put(vm);
	} else if (xe_vma_is_null(vma) || xe_vma_is_cpu_addr_mirror(vma)) {
		xe_vm_put(vm);
	} else {
		xe_bo_put(bo);
	}

	xe_vma_free(vma);
}
1178 
1179 static void vma_destroy_work_func(struct work_struct *w)
1180 {
1181 	struct xe_vma *vma =
1182 		container_of(w, struct xe_vma, destroy_work);
1183 
1184 	xe_vma_destroy_late(vma);
1185 }
1186 
/*
 * Fence callback run when the fence guarding VMA destruction signals.
 * Fence callbacks run in signalling (atomic) context, so punt the actual
 * teardown to a workqueue where sleeping is allowed.
 */
static void vma_destroy_cb(struct dma_fence *fence,
			   struct dma_fence_cb *cb)
{
	struct xe_vma *vma = container_of(cb, struct xe_vma, destroy_cb);

	INIT_WORK(&vma->destroy_work, vma_destroy_work_func);
	queue_work(system_dfl_wq, &vma->destroy_work);
}
1195 
/*
 * Unlink a VMA from its VM and schedule its destruction.
 *
 * Requires the vm->lock held in write mode and the vm resv held. If @fence
 * is non-NULL, destruction is deferred until the fence signals; otherwise
 * (or if the fence has already signalled) it happens synchronously via
 * xe_vma_destroy_late().
 */
static void xe_vma_destroy(struct xe_vma *vma, struct dma_fence *fence)
{
	struct xe_vm *vm = xe_vma_vm(vma);
	struct xe_bo *bo = xe_vma_bo(vma);

	lockdep_assert_held_write(&vm->lock);
	xe_assert(vm->xe, list_empty(&vma->combined_links.destroy));

	if (xe_vma_is_userptr(vma)) {
		/* Callers must have flagged userptr VMAs as destroyed first. */
		xe_assert(vm->xe, vma->gpuva.flags & XE_VMA_DESTROYED);
		xe_userptr_destroy(to_userptr_vma(vma));
	} else if (!xe_vma_is_null(vma) && !xe_vma_is_cpu_addr_mirror(vma)) {
		xe_bo_assert_held(bo);

		drm_gpuva_unlink(&vma->gpuva);
		xe_bo_recompute_purgeable_state(bo);
	}

	xe_vm_assert_held(vm);
	if (fence) {
		int ret = dma_fence_add_callback(fence, &vma->destroy_cb,
						 vma_destroy_cb);

		if (ret) {
			/* -ENOENT means the fence already signalled. */
			XE_WARN_ON(ret != -ENOENT);
			xe_vma_destroy_late(vma);
		}
	} else {
		xe_vma_destroy_late(vma);
	}
}
1227 
1228 /**
1229  * xe_vm_lock_vma() - drm_exec utility to lock a vma
1230  * @exec: The drm_exec object we're currently locking for.
1231  * @vma: The vma for witch we want to lock the vm resv and any attached
1232  * object's resv.
1233  *
1234  * Return: 0 on success, negative error code on error. In particular
1235  * may return -EDEADLK on WW transaction contention and -EINTR if
1236  * an interruptible wait is terminated by a signal.
1237  */
1238 int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma)
1239 {
1240 	struct xe_vm *vm = xe_vma_vm(vma);
1241 	struct xe_bo *bo = xe_vma_bo(vma);
1242 	int err;
1243 
1244 	XE_WARN_ON(!vm);
1245 
1246 	err = drm_exec_lock_obj(exec, xe_vm_obj(vm));
1247 	if (!err && bo && !bo->vm)
1248 		err = drm_exec_lock_obj(exec, &bo->ttm.base);
1249 
1250 	return err;
1251 }
1252 
/*
 * Destroy a VMA without the vm resv (and possibly BO resv) already held:
 * take the needed locks inside a validation/drm_exec transaction, retrying
 * on WW contention, then destroy synchronously. Locking is not expected to
 * fail here, hence the asserts.
 */
static void xe_vma_destroy_unlocked(struct xe_vma *vma)
{
	struct xe_device *xe = xe_vma_vm(vma)->xe;
	struct xe_validation_ctx ctx;
	struct drm_exec exec;
	int err = 0;

	xe_validation_guard(&ctx, &xe->val, &exec, (struct xe_val_flags) {}, err) {
		err = xe_vm_lock_vma(&exec, vma);
		drm_exec_retry_on_contention(&exec);
		if (XE_WARN_ON(err))
			break;
		xe_vma_destroy(vma, NULL);
	}
	xe_assert(xe, !err);
}
1269 
1270 struct xe_vma *
1271 xe_vm_find_overlapping_vma(struct xe_vm *vm, u64 start, u64 range)
1272 {
1273 	struct drm_gpuva *gpuva;
1274 
1275 	lockdep_assert_held(&vm->lock);
1276 
1277 	if (xe_vm_is_closed_or_banned(vm))
1278 		return NULL;
1279 
1280 	xe_assert(vm->xe, start + range <= vm->size);
1281 
1282 	gpuva = drm_gpuva_find_first(&vm->gpuvm, start, range);
1283 
1284 	return gpuva ? gpuva_to_vma(gpuva) : NULL;
1285 }
1286 
/*
 * Insert @vma into the VM's GPU VA tree. The snap_mutex serializes against
 * VM snapshot capture walking the tree. Caller holds vm->lock.
 */
static int xe_vm_insert_vma(struct xe_vm *vm, struct xe_vma *vma)
{
	int err;

	xe_assert(vm->xe, xe_vma_vm(vma) == vm);
	lockdep_assert_held(&vm->lock);

	mutex_lock(&vm->snap_mutex);
	err = drm_gpuva_insert(&vm->gpuvm, &vma->gpuva);
	mutex_unlock(&vm->snap_mutex);
	XE_WARN_ON(err);	/* Shouldn't be possible */

	return err;
}
1301 
/*
 * Remove @vma from the VM's GPU VA tree (under snap_mutex, see
 * xe_vm_insert_vma()) and drop the fast-path fault cache if it pointed at
 * this VMA. Caller holds vm->lock.
 */
static void xe_vm_remove_vma(struct xe_vm *vm, struct xe_vma *vma)
{
	xe_assert(vm->xe, xe_vma_vm(vma) == vm);
	lockdep_assert_held(&vm->lock);

	mutex_lock(&vm->snap_mutex);
	drm_gpuva_remove(&vma->gpuva);
	mutex_unlock(&vm->snap_mutex);
	if (vm->usm.last_fault_vma == vma)
		vm->usm.last_fault_vma = NULL;
}
1313 
1314 static struct drm_gpuva_op *xe_vm_op_alloc(void)
1315 {
1316 	struct xe_vma_op *op;
1317 
1318 	op = kzalloc_obj(*op);
1319 
1320 	if (unlikely(!op))
1321 		return NULL;
1322 
1323 	return &op->base;
1324 }
1325 
1326 static void xe_vm_free(struct drm_gpuvm *gpuvm);
1327 
/* Callbacks xe provides to the generic drm_gpuvm VA manager. */
static const struct drm_gpuvm_ops gpuvm_ops = {
	.op_alloc = xe_vm_op_alloc,
	.vm_bo_validate = xe_gpuvm_validate,
	.vm_free = xe_vm_free,
};
1333 
1334 static u64 pde_encode_pat_index(u16 pat_index)
1335 {
1336 	u64 pte = 0;
1337 
1338 	if (pat_index & BIT(0))
1339 		pte |= XE_PPGTT_PTE_PAT0;
1340 
1341 	if (pat_index & BIT(1))
1342 		pte |= XE_PPGTT_PTE_PAT1;
1343 
1344 	return pte;
1345 }
1346 
1347 static u64 pte_encode_pat_index(u16 pat_index, u32 pt_level)
1348 {
1349 	u64 pte = 0;
1350 
1351 	if (pat_index & BIT(0))
1352 		pte |= XE_PPGTT_PTE_PAT0;
1353 
1354 	if (pat_index & BIT(1))
1355 		pte |= XE_PPGTT_PTE_PAT1;
1356 
1357 	if (pat_index & BIT(2)) {
1358 		if (pt_level)
1359 			pte |= XE_PPGTT_PDE_PDPE_PAT2;
1360 		else
1361 			pte |= XE_PPGTT_PTE_PAT2;
1362 	}
1363 
1364 	if (pat_index & BIT(3))
1365 		pte |= XELPG_PPGTT_PTE_PAT3;
1366 
1367 	if (pat_index & (BIT(4)))
1368 		pte |= XE2_PPGTT_PTE_PAT4;
1369 
1370 	return pte;
1371 }
1372 
1373 static u64 pte_encode_ps(u32 pt_level)
1374 {
1375 	XE_WARN_ON(pt_level > MAX_HUGEPTE_LEVEL);
1376 
1377 	if (pt_level == 1)
1378 		return XE_PDE_PS_2M;
1379 	else if (pt_level == 2)
1380 		return XE_PDPE_PS_1G;
1381 
1382 	return 0;
1383 }
1384 
/* Pick the driver-controlled PAT index used for non-leaf page-table nodes. */
static u16 pde_pat_index(struct xe_bo *bo)
{
	struct xe_device *xe = xe_bo_device(bo);
	u16 pat_index;

	/*
	 * We only have two bits to encode the PAT index in non-leaf nodes, but
	 * these only point to other paging structures so we only need a minimal
	 * selection of options. The user PAT index is only for encoding leaf
	 * nodes, where we have use of more bits to do the encoding. The
	 * non-leaf nodes are instead under driver control so the chosen index
	 * here should be distinct from the user PAT index. Also the
	 * corresponding coherency of the PAT index should be tied to the
	 * allocation type of the page table (or at least we should pick
	 * something which is always safe).
	 */
	if (!xe_bo_is_vram(bo) && bo->ttm.ttm->caching == ttm_cached)
		pat_index = xe->pat.idx[XE_CACHE_WB];
	else
		pat_index = xe->pat.idx[XE_CACHE_NONE];

	/* Must fit the 2-bit PDE PAT encoding, see pde_encode_pat_index(). */
	xe_assert(xe, pat_index <= 3);

	return pat_index;
}
1410 
1411 static u64 xelp_pde_encode_bo(struct xe_bo *bo, u64 bo_offset)
1412 {
1413 	u64 pde;
1414 
1415 	pde = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
1416 	pde |= XE_PAGE_PRESENT | XE_PAGE_RW;
1417 	pde |= pde_encode_pat_index(pde_pat_index(bo));
1418 
1419 	return pde;
1420 }
1421 
1422 static u64 xelp_pte_encode_bo(struct xe_bo *bo, u64 bo_offset,
1423 			      u16 pat_index, u32 pt_level)
1424 {
1425 	u64 pte;
1426 
1427 	pte = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
1428 	pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
1429 	pte |= pte_encode_pat_index(pat_index, pt_level);
1430 	pte |= pte_encode_ps(pt_level);
1431 
1432 	if (xe_bo_is_vram(bo) || xe_bo_is_stolen_devmem(bo))
1433 		pte |= XE_PPGTT_PTE_DM;
1434 
1435 	return pte;
1436 }
1437 
/*
 * Add VMA-derived attribute bits (present, RW, PAT, page size, NULL) to a
 * partially-built PTE value @pte.
 */
static u64 xelp_pte_encode_vma(u64 pte, struct xe_vma *vma,
			       u16 pat_index, u32 pt_level)
{
	struct xe_bo *bo = xe_vma_bo(vma);
	struct xe_vm *vm = xe_vma_vm(vma);

	pte |= XE_PAGE_PRESENT;

	if (likely(!xe_vma_read_only(vma)))
		pte |= XE_PAGE_RW;

	pte |= pte_encode_pat_index(pat_index, pt_level);
	pte |= pte_encode_ps(pt_level);

	/*
	 * NULL PTEs redirect to scratch page (return zeros on read).
	 * Set for: 1) explicit null VMAs, 2) purged BOs on scratch VMs.
	 * Never set NULL flag without scratch page - causes undefined behavior.
	 */
	if (unlikely(xe_vma_is_null(vma) ||
		     (bo && xe_bo_is_purged(bo) && xe_vm_has_scratch(vm))))
		pte |= XE_PTE_NULL;

	return pte;
}
1463 
1464 static u64 xelp_pte_encode_addr(struct xe_device *xe, u64 addr,
1465 				u16 pat_index,
1466 				u32 pt_level, bool devmem, u64 flags)
1467 {
1468 	u64 pte;
1469 
1470 	/* Avoid passing random bits directly as flags */
1471 	xe_assert(xe, !(flags & ~XE_PTE_PS64));
1472 
1473 	pte = addr;
1474 	pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
1475 	pte |= pte_encode_pat_index(pat_index, pt_level);
1476 	pte |= pte_encode_ps(pt_level);
1477 
1478 	if (devmem)
1479 		pte |= XE_PPGTT_PTE_DM;
1480 
1481 	pte |= flags;
1482 
1483 	return pte;
1484 }
1485 
/* Xe_LP-and-later PTE/PDE encoding callbacks used by the page-table code. */
static const struct xe_pt_ops xelp_pt_ops = {
	.pte_encode_bo = xelp_pte_encode_bo,
	.pte_encode_vma = xelp_pte_encode_vma,
	.pte_encode_addr = xelp_pte_encode_addr,
	.pde_encode_bo = xelp_pde_encode_bo,
};
1492 
1493 static void vm_destroy_work_func(struct work_struct *w);
1494 
1495 /**
1496  * xe_vm_create_scratch() - Setup a scratch memory pagetable tree for the
1497  * given tile and vm.
1498  * @xe: xe device.
1499  * @tile: tile to set up for.
1500  * @vm: vm to set up for.
1501  * @exec: The struct drm_exec object used to lock the vm resv.
1502  *
1503  * Sets up a pagetable tree with one page-table per level and a single
1504  * leaf PTE. All pagetable entries point to the single page-table or,
1505  * for MAX_HUGEPTE_LEVEL, a NULL huge PTE returning 0 on read and
1506  * writes become NOPs.
1507  *
1508  * Return: 0 on success, negative error code on error.
1509  */
1510 static int xe_vm_create_scratch(struct xe_device *xe, struct xe_tile *tile,
1511 				struct xe_vm *vm, struct drm_exec *exec)
1512 {
1513 	u8 id = tile->id;
1514 	int i;
1515 
1516 	for (i = MAX_HUGEPTE_LEVEL; i < vm->pt_root[id]->level; i++) {
1517 		vm->scratch_pt[id][i] = xe_pt_create(vm, tile, i, exec);
1518 		if (IS_ERR(vm->scratch_pt[id][i])) {
1519 			int err = PTR_ERR(vm->scratch_pt[id][i]);
1520 
1521 			vm->scratch_pt[id][i] = NULL;
1522 			return err;
1523 		}
1524 		xe_pt_populate_empty(tile, vm, vm->scratch_pt[id][i]);
1525 	}
1526 
1527 	return 0;
1528 }
1529 ALLOW_ERROR_INJECTION(xe_vm_create_scratch, ERRNO);
1530 
1531 static void xe_vm_free_scratch(struct xe_vm *vm)
1532 {
1533 	struct xe_tile *tile;
1534 	u8 id;
1535 
1536 	if (!xe_vm_has_scratch(vm))
1537 		return;
1538 
1539 	for_each_tile(tile, vm->xe, id) {
1540 		u32 i;
1541 
1542 		if (!vm->pt_root[id])
1543 			continue;
1544 
1545 		for (i = MAX_HUGEPTE_LEVEL; i < vm->pt_root[id]->level; ++i)
1546 			if (vm->scratch_pt[id][i])
1547 				xe_pt_destroy(vm->scratch_pt[id][i], vm->flags, NULL);
1548 	}
1549 }
1550 
1551 static void xe_vm_pt_destroy(struct xe_vm *vm)
1552 {
1553 	struct xe_tile *tile;
1554 	u8 id;
1555 
1556 	xe_vm_assert_held(vm);
1557 
1558 	for_each_tile(tile, vm->xe, id) {
1559 		if (vm->pt_root[id]) {
1560 			xe_pt_destroy(vm->pt_root[id], vm->flags, NULL);
1561 			vm->pt_root[id] = NULL;
1562 		}
1563 	}
1564 }
1565 
/*
 * Prime lockdep with the lock orderings this VM will exercise later, so
 * violations are reported at VM creation rather than on a rare runtime
 * path: exec_queues.lock may be taken under fs_reclaim, and the GuC CT
 * lock may be taken under exec_queues.lock. No-op without PROVE_LOCKING.
 */
static void xe_vm_init_prove_locking(struct xe_device *xe, struct xe_vm *vm)
{
	if (!IS_ENABLED(CONFIG_PROVE_LOCKING))
		return;

	fs_reclaim_acquire(GFP_KERNEL);
	might_lock(&vm->exec_queues.lock);
	fs_reclaim_release(GFP_KERNEL);

	down_read(&vm->exec_queues.lock);
	might_lock(&xe_root_mmio_gt(xe)->uc.guc.ct.lock);
	up_read(&vm->exec_queues.lock);
}
1579 
/*
 * xe_vm_create() - Allocate and initialize a VM.
 * @xe: xe device.
 * @flags: XE_VM_FLAG_* creation flags.
 * @xef: file owning the VM, or NULL for kernel-internal VMs.
 *
 * Allocates the VM structure, sets up locks/lists, per-tile root page
 * tables (and scratch tables if requested), default bind queues and, for
 * user VMs on USM-capable hardware, an ASID.
 *
 * Return: pointer to the new VM, or an ERR_PTR on failure.
 */
struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags, struct xe_file *xef)
{
	struct drm_gem_object *vm_resv_obj;
	struct xe_validation_ctx ctx;
	struct drm_exec exec;
	struct xe_vm *vm;
	int err;
	struct xe_tile *tile;
	u8 id;

	/*
	 * Since the GSCCS is not user-accessible, we don't expect a GSC VM to
	 * ever be in faulting mode.
	 */
	xe_assert(xe, !((flags & XE_VM_FLAG_GSC) && (flags & XE_VM_FLAG_FAULT_MODE)));

	vm = kzalloc(sizeof(*vm), GFP_KERNEL);
	if (!vm)
		return ERR_PTR(-ENOMEM);

	vm->xe = xe;

	vm->size = 1ull << xe->info.va_bits;
	vm->flags = flags;

	if (xef)
		vm->xef = xe_file_get(xef);
	/*
	 * GSC VMs are kernel-owned, only used for PXP ops and can sometimes be
	 * manipulated under the PXP mutex. However, the PXP mutex can be taken
	 * under a user-VM lock when the PXP session is started at exec_queue
	 * creation time. Those are different VMs and therefore there is no risk
	 * of deadlock, but we need to tell lockdep that this is the case or it
	 * will print a warning.
	 */
	if (flags & XE_VM_FLAG_GSC) {
		static struct lock_class_key gsc_vm_key;

		__init_rwsem(&vm->lock, "gsc_vm", &gsc_vm_key);
	} else {
		init_rwsem(&vm->lock);
	}
	mutex_init(&vm->snap_mutex);

	INIT_LIST_HEAD(&vm->rebind_list);

	INIT_LIST_HEAD(&vm->userptr.repin_list);
	INIT_LIST_HEAD(&vm->userptr.invalidated);
	spin_lock_init(&vm->userptr.invalidated_lock);

	INIT_LIST_HEAD(&vm->faults.list);
	spin_lock_init(&vm->faults.lock);

	ttm_lru_bulk_move_init(&vm->lru_bulk_move);

	INIT_WORK(&vm->destroy_work, vm_destroy_work_func);

	INIT_LIST_HEAD(&vm->preempt.exec_queues);
	for (id = 0; id < XE_MAX_TILES_PER_DEVICE * XE_MAX_GT_PER_TILE; ++id)
		INIT_LIST_HEAD(&vm->exec_queues.list[id]);
	if (flags & XE_VM_FLAG_FAULT_MODE)
		vm->preempt.min_run_period_ms = xe->min_run_period_pf_ms;
	else
		vm->preempt.min_run_period_ms = xe->min_run_period_lr_ms;

	init_rwsem(&vm->exec_queues.lock);
	xe_vm_init_prove_locking(xe, vm);

	for_each_tile(tile, xe, id)
		xe_range_fence_tree_init(&vm->rftree[id]);

	vm->pt_ops = &xelp_pt_ops;

	/*
	 * Long-running workloads are not protected by the scheduler references.
	 * By design, run_job for long-running workloads returns NULL and the
	 * scheduler drops all the references of it, hence protecting the VM
	 * for this case is necessary.
	 */
	if (flags & XE_VM_FLAG_LR_MODE) {
		INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func);
		xe_pm_runtime_get_noresume(xe);
		INIT_LIST_HEAD(&vm->preempt.pm_activate_link);
	}

	err = xe_svm_init(vm);
	if (err)
		goto err_no_resv;

	vm_resv_obj = drm_gpuvm_resv_object_alloc(&xe->drm);
	if (!vm_resv_obj) {
		err = -ENOMEM;
		goto err_svm_fini;
	}

	drm_gpuvm_init(&vm->gpuvm, "Xe VM", DRM_GPUVM_RESV_PROTECTED, &xe->drm,
		       vm_resv_obj, 0, vm->size, 0, 0, &gpuvm_ops);

	/* gpuvm now holds its own reference to the resv object. */
	drm_gem_object_put(vm_resv_obj);

	err = 0;
	/* Allocate page-table trees under the vm resv; retries on contention/OOM. */
	xe_validation_guard(&ctx, &xe->val, &exec, (struct xe_val_flags) {.interruptible = true},
			    err) {
		err = xe_vm_drm_exec_lock(vm, &exec);
		drm_exec_retry_on_contention(&exec);

		if (IS_DGFX(xe) && xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)
			vm->flags |= XE_VM_FLAG_64K;

		for_each_tile(tile, xe, id) {
			if (flags & XE_VM_FLAG_MIGRATION &&
			    tile->id != XE_VM_FLAG_TILE_ID(flags))
				continue;

			vm->pt_root[id] = xe_pt_create(vm, tile, xe->info.vm_max_level,
						       &exec);
			if (IS_ERR(vm->pt_root[id])) {
				err = PTR_ERR(vm->pt_root[id]);
				vm->pt_root[id] = NULL;
				xe_vm_pt_destroy(vm);
				drm_exec_retry_on_contention(&exec);
				xe_validation_retry_on_oom(&ctx, &err);
				break;
			}
		}
		if (err)
			break;

		if (xe_vm_has_scratch(vm)) {
			for_each_tile(tile, xe, id) {
				if (!vm->pt_root[id])
					continue;

				err = xe_vm_create_scratch(xe, tile, vm, &exec);
				if (err) {
					xe_vm_free_scratch(vm);
					xe_vm_pt_destroy(vm);
					drm_exec_retry_on_contention(&exec);
					xe_validation_retry_on_oom(&ctx, &err);
					break;
				}
			}
			if (err)
				break;
			vm->batch_invalidate_tlb = true;
		}

		/*
		 * NOTE(review): rebind_work was already initialized above when
		 * XE_VM_FLAG_LR_MODE is set; this second INIT_WORK looks
		 * redundant — confirm against upstream history.
		 */
		if (vm->flags & XE_VM_FLAG_LR_MODE) {
			INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func);
			vm->batch_invalidate_tlb = false;
		}

		/* Fill pt_root after allocating scratch tables */
		for_each_tile(tile, xe, id) {
			if (!vm->pt_root[id])
				continue;

			xe_pt_populate_empty(tile, vm, vm->pt_root[id]);
		}
	}
	if (err)
		goto err_close;

	/* Kernel migration VM shouldn't have a circular loop.. */
	if (!(flags & XE_VM_FLAG_MIGRATION)) {
		for_each_tile(tile, xe, id) {
			struct xe_exec_queue *q;
			u32 create_flags = EXEC_QUEUE_FLAG_VM;

			if (!vm->pt_root[id])
				continue;

			if (!xef) /* Not from userspace */
				create_flags |= EXEC_QUEUE_FLAG_KERNEL;

			q = xe_exec_queue_create_bind(xe, tile, vm, create_flags, 0);
			if (IS_ERR(q)) {
				err = PTR_ERR(q);
				goto err_close;
			}
			vm->q[id] = q;
		}
	}

	/* User VMs on USM hardware get an ASID for fault attribution. */
	if (xef && xe->info.has_asid) {
		u32 asid;

		down_write(&xe->usm.lock);
		err = xa_alloc_cyclic(&xe->usm.asid_to_vm, &asid, vm,
				      XA_LIMIT(1, XE_MAX_ASID - 1),
				      &xe->usm.next_asid, GFP_NOWAIT);
		up_write(&xe->usm.lock);
		if (err < 0)
			goto err_close;

		vm->usm.asid = asid;
	}

	trace_xe_vm_create(vm);

	return vm;

err_close:
	/* Past gpuvm init the full close/put path handles partial state. */
	xe_vm_close_and_put(vm);
	return ERR_PTR(err);

err_svm_fini:
	if (flags & XE_VM_FLAG_FAULT_MODE) {
		vm->size = 0; /* close the vm */
		xe_svm_fini(vm);
	}
err_no_resv:
	mutex_destroy(&vm->snap_mutex);
	for_each_tile(tile, xe, id)
		xe_range_fence_tree_fini(&vm->rftree[id]);
	ttm_lru_bulk_move_fini(&xe->ttm, &vm->lru_bulk_move);
	if (vm->xef)
		xe_file_put(vm->xef);
	kfree(vm);
	if (flags & XE_VM_FLAG_LR_MODE)
		xe_pm_runtime_put(xe);
	return ERR_PTR(err);
}
1803 
/*
 * Mark the VM closed (size = 0), wait for pending binds and, when the
 * device is still present, clear the root page tables and invalidate the
 * TLBs so the hardware can no longer reach this VM's mappings.
 */
static void xe_vm_close(struct xe_vm *vm)
{
	struct xe_device *xe = vm->xe;
	bool bound;
	int idx;

	/* False if the device has been unplugged; skip HW access then. */
	bound = drm_dev_enter(&xe->drm, &idx);

	down_write(&vm->lock);
	if (xe_vm_in_fault_mode(vm))
		xe_svm_notifier_lock(vm);

	/* size == 0 is how the rest of the driver detects a closed VM. */
	vm->size = 0;

	if (!((vm->flags & XE_VM_FLAG_MIGRATION))) {
		struct xe_tile *tile;
		struct xe_gt *gt;
		u8 id;

		/* Wait for pending binds */
		dma_resv_wait_timeout(xe_vm_resv(vm),
				      DMA_RESV_USAGE_BOOKKEEP,
				      false, MAX_SCHEDULE_TIMEOUT);

		if (bound) {
			for_each_tile(tile, xe, id)
				if (vm->pt_root[id])
					xe_pt_clear(xe, vm->pt_root[id]);

			for_each_gt(gt, xe, id)
				xe_tlb_inval_vm(&gt->tlb_inval, vm);
		}
	}

	if (xe_vm_in_fault_mode(vm))
		xe_svm_notifier_unlock(vm);
	up_write(&vm->lock);

	if (bound)
		drm_dev_exit(idx);
}
1845 
/*
 * xe_vm_close_and_put() - Close a VM and drop the creation reference.
 * @vm: the VM to tear down.
 *
 * Closes the VM, flushes preempt-rebind/SVM activity, kills and releases
 * the bind queues, destroys all VMAs and page tables, releases the ASID
 * and finally drops the reference, after which vm_destroy_work_func()
 * frees the structure.
 */
void xe_vm_close_and_put(struct xe_vm *vm)
{
	LIST_HEAD(contested);
	struct xe_device *xe = vm->xe;
	struct xe_tile *tile;
	struct xe_vma *vma, *next_vma;
	struct drm_gpuva *gpuva, *next;
	u8 id;

	xe_assert(xe, !vm->preempt.num_exec_queues);

	xe_vm_close(vm);
	if (xe_vm_in_preempt_fence_mode(vm)) {
		mutex_lock(&xe->rebind_resume_lock);
		list_del_init(&vm->preempt.pm_activate_link);
		mutex_unlock(&xe->rebind_resume_lock);
		flush_work(&vm->preempt.rebind_work);
	}
	if (xe_vm_in_fault_mode(vm))
		xe_svm_close(vm);

	/* Drop last fences first, under vm->lock but before killing queues. */
	down_write(&vm->lock);
	for_each_tile(tile, xe, id) {
		if (vm->q[id]) {
			int i;

			xe_exec_queue_last_fence_put(vm->q[id], vm);
			for_each_tlb_inval(i)
				xe_exec_queue_tlb_inval_last_fence_put(vm->q[id], vm, i);
		}
	}
	up_write(&vm->lock);

	for_each_tile(tile, xe, id) {
		if (vm->q[id]) {
			xe_exec_queue_kill(vm->q[id]);
			xe_exec_queue_put(vm->q[id]);
			vm->q[id] = NULL;
		}
	}

	down_write(&vm->lock);
	xe_vm_lock(vm, false);
	drm_gpuvm_for_each_va_safe(gpuva, next, &vm->gpuvm) {
		vma = gpuva_to_vma(gpuva);

		if (xe_vma_has_no_bo(vma)) {
			xe_svm_notifier_lock(vm);
			vma->gpuva.flags |= XE_VMA_DESTROYED;
			xe_svm_notifier_unlock(vm);
		}

		xe_vm_remove_vma(vm, vma);

		/* easy case, remove from VMA? */
		if (xe_vma_has_no_bo(vma) || xe_vma_bo(vma)->vm) {
			list_del_init(&vma->combined_links.rebind);
			xe_vma_destroy(vma, NULL);
			continue;
		}

		/*
		 * External-BO VMAs need the BO resv too; defer them to the
		 * contested list and destroy them after dropping the vm resv.
		 */
		list_move_tail(&vma->combined_links.destroy, &contested);
		vma->gpuva.flags |= XE_VMA_DESTROYED;
	}

	/*
	 * All vm operations will add shared fences to resv.
	 * The only exception is eviction for a shared object,
	 * but even so, the unbind when evicted would still
	 * install a fence to resv. Hence it's safe to
	 * destroy the pagetables immediately.
	 */
	xe_vm_free_scratch(vm);
	xe_vm_pt_destroy(vm);
	xe_vm_unlock(vm);

	/*
	 * VM is now dead, cannot re-add nodes to vm->vmas if it's NULL
	 * Since we hold a refcount to the bo, we can remove and free
	 * the members safely without locking.
	 */
	list_for_each_entry_safe(vma, next_vma, &contested,
				 combined_links.destroy) {
		list_del_init(&vma->combined_links.destroy);
		xe_vma_destroy_unlocked(vma);
	}

	xe_svm_fini(vm);

	up_write(&vm->lock);

	down_write(&xe->usm.lock);
	if (vm->usm.asid) {
		void *lookup;

		xe_assert(xe, xe->info.has_asid);
		xe_assert(xe, !(vm->flags & XE_VM_FLAG_MIGRATION));

		lookup = xa_erase(&xe->usm.asid_to_vm, vm->usm.asid);
		xe_assert(xe, lookup == vm);
	}
	up_write(&xe->usm.lock);

	xe_vm_clear_fault_entries(vm);

	for_each_tile(tile, xe, id)
		xe_range_fence_tree_fini(&vm->rftree[id]);

	xe_vm_put(vm);
}
1956 
/*
 * Deferred final free of a VM, queued by xe_vm_free() once the last
 * reference is dropped. Runs from a workqueue because freeing needs to
 * sleep (flush_work, runtime PM, mutex teardown).
 */
static void vm_destroy_work_func(struct work_struct *w)
{
	struct xe_vm *vm =
		container_of(w, struct xe_vm, destroy_work);
	struct xe_device *xe = vm->xe;
	struct xe_tile *tile;
	u8 id;

	/* xe_vm_close_and_put was not called? */
	xe_assert(xe, !vm->size);

	if (xe_vm_in_preempt_fence_mode(vm))
		flush_work(&vm->preempt.rebind_work);

	mutex_destroy(&vm->snap_mutex);

	/* Balance the xe_pm_runtime_get_noresume() taken in xe_vm_create(). */
	if (vm->flags & XE_VM_FLAG_LR_MODE)
		xe_pm_runtime_put(xe);

	for_each_tile(tile, xe, id)
		XE_WARN_ON(vm->pt_root[id]);

	trace_xe_vm_free(vm);

	ttm_lru_bulk_move_fini(&xe->ttm, &vm->lru_bulk_move);

	if (vm->xef)
		xe_file_put(vm->xef);

	kfree(vm);
}
1988 
/*
 * drm_gpuvm vm_free callback: invoked when the gpuvm refcount hits zero,
 * potentially from atomic context.
 */
static void xe_vm_free(struct drm_gpuvm *gpuvm)
{
	struct xe_vm *vm = container_of(gpuvm, struct xe_vm, gpuvm);

	/* To destroy the VM we need to be able to sleep */
	queue_work(system_dfl_wq, &vm->destroy_work);
}
1996 
/*
 * xe_vm_lookup() - Look up a VM by user-visible id for @xef.
 *
 * Takes a reference on the VM under xef->vm.lock so it cannot be destroyed
 * concurrently. Caller must xe_vm_put() the result.
 *
 * Return: referenced VM, or NULL if @id is not mapped for this file.
 */
struct xe_vm *xe_vm_lookup(struct xe_file *xef, u32 id)
{
	struct xe_vm *vm;

	mutex_lock(&xef->vm.lock);
	vm = xa_load(&xef->vm.xa, id);
	if (vm)
		xe_vm_get(vm);
	mutex_unlock(&xef->vm.lock);

	return vm;
}
2009 
/* Encode the PDP4 descriptor (root page-table PDE) for @vm on @tile. */
u64 xe_vm_pdp4_descriptor(struct xe_vm *vm, struct xe_tile *tile)
{
	return vm->pt_ops->pde_encode_bo(vm->pt_root[tile->id]->bo, 0);
}
2014 
2015 static struct xe_exec_queue *
2016 to_wait_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
2017 {
2018 	return q ? q : vm->q[0];
2019 }
2020 
2021 static struct xe_user_fence *
2022 find_ufence_get(struct xe_sync_entry *syncs, u32 num_syncs)
2023 {
2024 	unsigned int i;
2025 
2026 	for (i = 0; i < num_syncs; i++) {
2027 		struct xe_sync_entry *e = &syncs[i];
2028 
2029 		if (xe_sync_is_ufence(e))
2030 			return xe_sync_ufence_get(e);
2031 	}
2032 
2033 	return NULL;
2034 }
2035 
/* Mask of every uapi VM-create flag this driver accepts. */
#define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE | \
				    DRM_XE_VM_CREATE_FLAG_LR_MODE | \
				    DRM_XE_VM_CREATE_FLAG_FAULT_MODE | \
				    DRM_XE_VM_CREATE_FLAG_NO_VM_OVERCOMMIT)
2040 
/*
 * xe_vm_create_ioctl() - DRM_IOCTL_XE_VM_CREATE handler.
 *
 * Validates the uapi flags, creates the VM, and publishes it in the file's
 * id xarray. Return: 0 on success, negative error code on failure.
 */
int xe_vm_create_ioctl(struct drm_device *dev, void *data,
		       struct drm_file *file)
{
	struct xe_device *xe = to_xe_device(dev);
	struct xe_file *xef = to_xe_file(file);
	struct drm_xe_vm_create *args = data;
	struct xe_gt *wa_gt = xe_root_mmio_gt(xe);
	struct xe_vm *vm;
	u32 id;
	int err;
	u32 flags = 0;

	if (XE_IOCTL_DBG(xe, args->extensions))
		return -EINVAL;

	/* Workaround: force scratch pages on affected hardware. */
	if (wa_gt && XE_GT_WA(wa_gt, 22014953428))
		args->flags |= DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE;

	if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE &&
			 !xe->info.has_usm))
		return -EINVAL;

	if (XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
		return -EINVAL;

	if (XE_IOCTL_DBG(xe, args->flags & ~ALL_DRM_XE_VM_CREATE_FLAGS))
		return -EINVAL;

	if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE &&
			 args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE &&
			 !xe->info.needs_scratch))
		return -EINVAL;

	/* FAULT_MODE implies LR_MODE; NO_VM_OVERCOMMIT requires FAULT_MODE. */
	if (XE_IOCTL_DBG(xe, !(args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE) &&
			 args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE))
		return -EINVAL;

	if (XE_IOCTL_DBG(xe, !(args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE) &&
			 args->flags & DRM_XE_VM_CREATE_FLAG_NO_VM_OVERCOMMIT))
		return -EINVAL;

	if (args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE)
		flags |= XE_VM_FLAG_SCRATCH_PAGE;
	if (args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE)
		flags |= XE_VM_FLAG_LR_MODE;
	if (args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
		flags |= XE_VM_FLAG_FAULT_MODE;
	if (args->flags & DRM_XE_VM_CREATE_FLAG_NO_VM_OVERCOMMIT)
		flags |= XE_VM_FLAG_NO_VM_OVERCOMMIT;

	vm = xe_vm_create(xe, flags, xef);
	if (IS_ERR(vm))
		return PTR_ERR(vm);

#if IS_ENABLED(CONFIG_DRM_XE_DEBUG_MEM)
	/* Warning: Security issue - never enable by default */
	args->reserved[0] = xe_bo_main_addr(vm->pt_root[0]->bo, XE_PAGE_SIZE);
#endif

	/* user id alloc must always be last in ioctl to prevent UAF */
	err = xa_alloc(&xef->vm.xa, &id, vm, xa_limit_32b, GFP_KERNEL);
	if (err)
		goto err_close_and_put;

	args->vm_id = id;

	return 0;

err_close_and_put:
	xe_vm_close_and_put(vm);

	return err;
}
2114 
/*
 * xe_vm_destroy_ioctl() - DRM_IOCTL_XE_VM_DESTROY handler.
 *
 * Unpublishes the VM id and tears the VM down. The lookup, busy check and
 * erase happen atomically under xef->vm.lock so a concurrent destroy
 * cannot free the same VM twice. Return: 0, -ENOENT, -EBUSY or -EINVAL.
 */
int xe_vm_destroy_ioctl(struct drm_device *dev, void *data,
			struct drm_file *file)
{
	struct xe_device *xe = to_xe_device(dev);
	struct xe_file *xef = to_xe_file(file);
	struct drm_xe_vm_destroy *args = data;
	struct xe_vm *vm;
	int err = 0;

	if (XE_IOCTL_DBG(xe, args->pad) ||
	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
		return -EINVAL;

	mutex_lock(&xef->vm.lock);
	vm = xa_load(&xef->vm.xa, args->vm_id);
	if (XE_IOCTL_DBG(xe, !vm))
		err = -ENOENT;
	else if (XE_IOCTL_DBG(xe, vm->preempt.num_exec_queues))
		err = -EBUSY;
	else
		xa_erase(&xef->vm.xa, args->vm_id);
	mutex_unlock(&xef->vm.lock);

	if (!err)
		xe_vm_close_and_put(vm);

	return err;
}
2143 
2144 static int xe_vm_query_vmas(struct xe_vm *vm, u64 start, u64 end)
2145 {
2146 	struct drm_gpuva *gpuva;
2147 	u32 num_vmas = 0;
2148 
2149 	lockdep_assert_held(&vm->lock);
2150 	drm_gpuvm_for_each_va_range(gpuva, &vm->gpuvm, start, end)
2151 		num_vmas++;
2152 
2153 	return num_vmas;
2154 }
2155 
/*
 * Fill @attrs with the memory-range attributes of every VMA overlapping
 * [start, end). On input *num_vmas is the capacity of @attrs; on success
 * it is updated to the number of entries written.
 *
 * Return: 0 on success, -ENOSPC if @attrs is too small. Caller holds
 * vm->lock.
 */
static int get_mem_attrs(struct xe_vm *vm, u32 *num_vmas, u64 start,
			 u64 end, struct drm_xe_mem_range_attr *attrs)
{
	struct drm_gpuva *gpuva;
	int i = 0;

	lockdep_assert_held(&vm->lock);

	drm_gpuvm_for_each_va_range(gpuva, &vm->gpuvm, start, end) {
		struct xe_vma *vma = gpuva_to_vma(gpuva);

		if (i == *num_vmas)
			return -ENOSPC;

		attrs[i].start = xe_vma_start(vma);
		attrs[i].end = xe_vma_end(vma);
		attrs[i].atomic.val = vma->attr.atomic_access;
		attrs[i].pat_index.val = vma->attr.pat_index;
		attrs[i].preferred_mem_loc.devmem_fd = vma->attr.preferred_loc.devmem_fd;
		attrs[i].preferred_mem_loc.migration_policy =
		vma->attr.preferred_loc.migration_policy;

		i++;
	}

	*num_vmas = i;
	return 0;
}
2184 
2185 int xe_vm_query_vmas_attrs_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
2186 {
2187 	struct xe_device *xe = to_xe_device(dev);
2188 	struct xe_file *xef = to_xe_file(file);
2189 	struct drm_xe_mem_range_attr *mem_attrs;
2190 	struct drm_xe_vm_query_mem_range_attr *args = data;
2191 	u64 __user *attrs_user = u64_to_user_ptr(args->vector_of_mem_attr);
2192 	struct xe_vm *vm;
2193 	int err = 0;
2194 
2195 	if (XE_IOCTL_DBG(xe,
2196 			 ((args->num_mem_ranges == 0 &&
2197 			  (attrs_user || args->sizeof_mem_range_attr != 0)) ||
2198 			 (args->num_mem_ranges > 0 &&
2199 			  (!attrs_user ||
2200 			   args->sizeof_mem_range_attr !=
2201 			   sizeof(struct drm_xe_mem_range_attr))))))
2202 		return -EINVAL;
2203 
2204 	vm = xe_vm_lookup(xef, args->vm_id);
2205 	if (XE_IOCTL_DBG(xe, !vm))
2206 		return -EINVAL;
2207 
2208 	err = down_read_interruptible(&vm->lock);
2209 	if (err)
2210 		goto put_vm;
2211 
2212 	attrs_user = u64_to_user_ptr(args->vector_of_mem_attr);
2213 
2214 	if (args->num_mem_ranges == 0 && !attrs_user) {
2215 		args->num_mem_ranges = xe_vm_query_vmas(vm, args->start, args->start + args->range);
2216 		args->sizeof_mem_range_attr = sizeof(struct drm_xe_mem_range_attr);
2217 		goto unlock_vm;
2218 	}
2219 
2220 	mem_attrs = kvmalloc_array(args->num_mem_ranges, args->sizeof_mem_range_attr,
2221 				   GFP_KERNEL | __GFP_ACCOUNT |
2222 				   __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
2223 	if (!mem_attrs) {
2224 		err = args->num_mem_ranges > 1 ? -ENOBUFS : -ENOMEM;
2225 		goto unlock_vm;
2226 	}
2227 
2228 	memset(mem_attrs, 0, args->num_mem_ranges * args->sizeof_mem_range_attr);
2229 	err = get_mem_attrs(vm, &args->num_mem_ranges, args->start,
2230 			    args->start + args->range, mem_attrs);
2231 	if (err)
2232 		goto free_mem_attrs;
2233 
2234 	err = copy_to_user(attrs_user, mem_attrs,
2235 			   args->sizeof_mem_range_attr * args->num_mem_ranges);
2236 	if (err)
2237 		err = -EFAULT;
2238 
2239 free_mem_attrs:
2240 	kvfree(mem_attrs);
2241 unlock_vm:
2242 	up_read(&vm->lock);
2243 put_vm:
2244 	xe_vm_put(vm);
2245 	return err;
2246 }
2247 
2248 static bool vma_matches(struct xe_vma *vma, u64 page_addr)
2249 {
2250 	if (page_addr > xe_vma_end(vma) - 1 ||
2251 	    page_addr + SZ_4K - 1 < xe_vma_start(vma))
2252 		return false;
2253 
2254 	return true;
2255 }
2256 
2257 /**
2258  * xe_vm_find_vma_by_addr() - Find a VMA by its address
2259  *
2260  * @vm: the xe_vm the vma belongs to
2261  * @page_addr: address to look up
2262  */
2263 struct xe_vma *xe_vm_find_vma_by_addr(struct xe_vm *vm, u64 page_addr)
2264 {
2265 	struct xe_vma *vma = NULL;
2266 
2267 	if (vm->usm.last_fault_vma) {   /* Fast lookup */
2268 		if (vma_matches(vm->usm.last_fault_vma, page_addr))
2269 			vma = vm->usm.last_fault_vma;
2270 	}
2271 	if (!vma)
2272 		vma = xe_vm_find_overlapping_vma(vm, page_addr, SZ_4K);
2273 
2274 	return vma;
2275 }
2276 
/*
 * TTM placement for each DRM_XE_VM_BIND_OP_PREFETCH region index.
 * Indexed directly with op->prefetch.region in op_lock_and_prep(), and
 * offset by XE_PL_VRAM0 to pick a tile in vm_bind_ioctl_ops_create().
 */
static const u32 region_to_mem_type[] = {
	XE_PL_TT,
	XE_PL_VRAM0,
	XE_PL_VRAM1,
};
2282 
/*
 * Mark @vma destroyed under the SVM notifier lock so concurrent notifier
 * callbacks observe the flag consistently, and, if the op was already
 * committed (@post_commit), remove the VMA from the VM's tracking.
 */
static void prep_vma_destroy(struct xe_vm *vm, struct xe_vma *vma,
			     bool post_commit)
{
	xe_svm_notifier_lock(vm);
	vma->gpuva.flags |= XE_VMA_DESTROYED;
	xe_svm_notifier_unlock(vm);
	if (post_commit)
		xe_vm_remove_vma(vm, vma);
}
2292 
/* Shorthand cast for printing u64 values with %llx. */
#undef ULL
#define ULL	unsigned long long

#if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
/* Emit a debug line describing a single GPUVA operation. */
static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
{
	struct xe_vma *vma;

	switch (op->op) {
	case DRM_GPUVA_OP_MAP:
		vm_dbg(&xe->drm, "MAP: addr=0x%016llx, range=0x%016llx",
		       (ULL)op->map.va.addr, (ULL)op->map.va.range);
		break;
	case DRM_GPUVA_OP_REMAP:
		vma = gpuva_to_vma(op->remap.unmap->va);
		vm_dbg(&xe->drm, "REMAP:UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
		       op->remap.unmap->keep ? 1 : 0);
		/* prev/next are the surviving pieces of a split mapping */
		if (op->remap.prev)
			vm_dbg(&xe->drm,
			       "REMAP:PREV: addr=0x%016llx, range=0x%016llx",
			       (ULL)op->remap.prev->va.addr,
			       (ULL)op->remap.prev->va.range);
		if (op->remap.next)
			vm_dbg(&xe->drm,
			       "REMAP:NEXT: addr=0x%016llx, range=0x%016llx",
			       (ULL)op->remap.next->va.addr,
			       (ULL)op->remap.next->va.range);
		break;
	case DRM_GPUVA_OP_UNMAP:
		vma = gpuva_to_vma(op->unmap.va);
		vm_dbg(&xe->drm, "UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
		       op->unmap.keep ? 1 : 0);
		break;
	case DRM_GPUVA_OP_PREFETCH:
		vma = gpuva_to_vma(op->prefetch.va);
		vm_dbg(&xe->drm, "PREFETCH: addr=0x%016llx, range=0x%016llx",
		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma));
		break;
	default:
		drm_warn(&xe->drm, "NOT POSSIBLE\n");
	}
}
#else
/* No-op when VM debugging is compiled out. */
static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
{
}
#endif
2342 
2343 static bool __xe_vm_needs_clear_scratch_pages(struct xe_vm *vm, u32 bind_flags)
2344 {
2345 	if (!xe_vm_in_fault_mode(vm))
2346 		return false;
2347 
2348 	if (!xe_vm_has_scratch(vm))
2349 		return false;
2350 
2351 	if (bind_flags & DRM_XE_VM_BIND_FLAG_IMMEDIATE)
2352 		return false;
2353 
2354 	return true;
2355 }
2356 
/* Release per-op SVM prefetch state for every operation in @ops. */
static void xe_svm_prefetch_gpuva_ops_fini(struct drm_gpuva_ops *ops)
{
	struct drm_gpuva_op *__op;

	drm_gpuva_for_each_op(__op, ops) {
		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);

		xe_vma_svm_prefetch_op_fini(op);
	}
}
2367 
2368 /*
2369  * Create operations list from IOCTL arguments, setup operations fields so parse
2370  * and commit steps are decoupled from IOCTL arguments. This step can fail.
2371  */
2372 static struct drm_gpuva_ops *
2373 vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_vma_ops *vops,
2374 			 struct xe_bo *bo, u64 bo_offset_or_userptr,
2375 			 u64 addr, u64 range,
2376 			 u32 operation, u32 flags,
2377 			 u32 prefetch_region, u16 pat_index)
2378 {
2379 	struct drm_gem_object *obj = bo ? &bo->ttm.base : NULL;
2380 	struct drm_gpuva_ops *ops;
2381 	struct drm_gpuva_op *__op;
2382 	struct drm_gpuvm_bo *vm_bo;
2383 	u64 range_start = addr;
2384 	u64 range_end = addr + range;
2385 	int err;
2386 
2387 	lockdep_assert_held_write(&vm->lock);
2388 
2389 	vm_dbg(&vm->xe->drm,
2390 	       "op=%d, addr=0x%016llx, range=0x%016llx, bo_offset_or_userptr=0x%016llx",
2391 	       operation, (ULL)addr, (ULL)range,
2392 	       (ULL)bo_offset_or_userptr);
2393 
2394 	switch (operation) {
2395 	case DRM_XE_VM_BIND_OP_MAP:
2396 		if (flags & DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR) {
2397 			xe_vm_find_cpu_addr_mirror_vma_range(vm, &range_start, &range_end);
2398 			vops->flags |= XE_VMA_OPS_FLAG_ALLOW_SVM_UNMAP;
2399 		}
2400 
2401 		fallthrough;
2402 	case DRM_XE_VM_BIND_OP_MAP_USERPTR: {
2403 		struct drm_gpuvm_map_req map_req = {
2404 			.map.va.addr = range_start,
2405 			.map.va.range = range_end - range_start,
2406 			.map.gem.obj = obj,
2407 			.map.gem.offset = bo_offset_or_userptr,
2408 		};
2409 
2410 		ops = drm_gpuvm_sm_map_ops_create(&vm->gpuvm, &map_req);
2411 		break;
2412 	}
2413 	case DRM_XE_VM_BIND_OP_UNMAP:
2414 		ops = drm_gpuvm_sm_unmap_ops_create(&vm->gpuvm, addr, range);
2415 		break;
2416 	case DRM_XE_VM_BIND_OP_PREFETCH:
2417 		ops = drm_gpuvm_prefetch_ops_create(&vm->gpuvm, addr, range);
2418 		break;
2419 	case DRM_XE_VM_BIND_OP_UNMAP_ALL:
2420 		xe_assert(vm->xe, bo);
2421 
2422 		err = xe_bo_lock(bo, true);
2423 		if (err)
2424 			return ERR_PTR(err);
2425 
2426 		vm_bo = drm_gpuvm_bo_obtain_locked(&vm->gpuvm, obj);
2427 		if (IS_ERR(vm_bo)) {
2428 			xe_bo_unlock(bo);
2429 			return ERR_CAST(vm_bo);
2430 		}
2431 
2432 		ops = drm_gpuvm_bo_unmap_ops_create(vm_bo);
2433 		drm_gpuvm_bo_put(vm_bo);
2434 		xe_bo_unlock(bo);
2435 		break;
2436 	default:
2437 		drm_warn(&vm->xe->drm, "NOT POSSIBLE\n");
2438 		ops = ERR_PTR(-EINVAL);
2439 	}
2440 	if (IS_ERR(ops))
2441 		return ops;
2442 
2443 	drm_gpuva_for_each_op(__op, ops) {
2444 		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2445 
2446 		if (__op->op == DRM_GPUVA_OP_MAP) {
2447 			op->map.immediate =
2448 				flags & DRM_XE_VM_BIND_FLAG_IMMEDIATE;
2449 			if (flags & DRM_XE_VM_BIND_FLAG_READONLY)
2450 				op->map.vma_flags |= XE_VMA_READ_ONLY;
2451 			if (flags & DRM_XE_VM_BIND_FLAG_NULL)
2452 				op->map.vma_flags |= DRM_GPUVA_SPARSE;
2453 			if (flags & DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR)
2454 				op->map.vma_flags |= XE_VMA_SYSTEM_ALLOCATOR;
2455 			if (flags & DRM_XE_VM_BIND_FLAG_DUMPABLE)
2456 				op->map.vma_flags |= XE_VMA_DUMPABLE;
2457 			if (flags & DRM_XE_VM_BIND_FLAG_MADVISE_AUTORESET)
2458 				op->map.vma_flags |= XE_VMA_MADV_AUTORESET;
2459 			op->map.request_decompress = flags & DRM_XE_VM_BIND_FLAG_DECOMPRESS;
2460 			op->map.pat_index = pat_index;
2461 			op->map.invalidate_on_bind =
2462 				__xe_vm_needs_clear_scratch_pages(vm, flags);
2463 		} else if (__op->op == DRM_GPUVA_OP_PREFETCH) {
2464 			struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
2465 			struct xe_tile *tile;
2466 			struct xe_svm_range *svm_range;
2467 			struct drm_gpusvm_ctx ctx = {};
2468 			struct drm_pagemap *dpagemap = NULL;
2469 			u8 id, tile_mask = 0;
2470 			u32 i;
2471 
2472 			if (!xe_vma_is_cpu_addr_mirror(vma)) {
2473 				op->prefetch.region = prefetch_region;
2474 				break;
2475 			}
2476 
2477 			ctx.read_only = xe_vma_read_only(vma);
2478 			ctx.devmem_possible = IS_DGFX(vm->xe) &&
2479 					      IS_ENABLED(CONFIG_DRM_XE_PAGEMAP);
2480 
2481 			for_each_tile(tile, vm->xe, id)
2482 				tile_mask |= 0x1 << id;
2483 
2484 			xa_init_flags(&op->prefetch_range.range, XA_FLAGS_ALLOC);
2485 			op->prefetch_range.ranges_count = 0;
2486 
2487 			if (prefetch_region == DRM_XE_CONSULT_MEM_ADVISE_PREF_LOC) {
2488 				dpagemap = xe_vma_resolve_pagemap(vma,
2489 								  xe_device_get_root_tile(vm->xe));
2490 			} else if (prefetch_region) {
2491 				tile = &vm->xe->tiles[region_to_mem_type[prefetch_region] -
2492 						      XE_PL_VRAM0];
2493 				dpagemap = xe_tile_local_pagemap(tile);
2494 			}
2495 
2496 			op->prefetch_range.dpagemap = dpagemap;
2497 alloc_next_range:
2498 			svm_range = xe_svm_range_find_or_insert(vm, addr, vma, &ctx);
2499 
2500 			if (PTR_ERR(svm_range) == -ENOENT) {
2501 				u64 ret = xe_svm_find_vma_start(vm, addr, range_end, vma);
2502 
2503 				addr = ret == ULONG_MAX ? 0 : ret;
2504 				if (addr)
2505 					goto alloc_next_range;
2506 				else
2507 					goto print_op_label;
2508 			}
2509 
2510 			if (IS_ERR(svm_range)) {
2511 				err = PTR_ERR(svm_range);
2512 				goto unwind_prefetch_ops;
2513 			}
2514 
2515 			if (xe_svm_range_validate(vm, svm_range, tile_mask, dpagemap)) {
2516 				xe_svm_range_debug(svm_range, "PREFETCH - RANGE IS VALID");
2517 				goto check_next_range;
2518 			}
2519 
2520 			err = xa_alloc(&op->prefetch_range.range,
2521 				       &i, svm_range, xa_limit_32b,
2522 				       GFP_KERNEL);
2523 
2524 			if (err)
2525 				goto unwind_prefetch_ops;
2526 
2527 			op->prefetch_range.ranges_count++;
2528 			vops->flags |= XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH;
2529 			xe_svm_range_debug(svm_range, "PREFETCH - RANGE CREATED");
2530 check_next_range:
2531 			if (range_end > xe_svm_range_end(svm_range) &&
2532 			    xe_svm_range_end(svm_range) < xe_vma_end(vma)) {
2533 				addr = xe_svm_range_end(svm_range);
2534 				goto alloc_next_range;
2535 			}
2536 		}
2537 print_op_label:
2538 		print_op(vm->xe, __op);
2539 	}
2540 
2541 	return ops;
2542 
2543 unwind_prefetch_ops:
2544 	xe_svm_prefetch_gpuva_ops_fini(ops);
2545 	drm_gpuva_ops_free(&vm->gpuvm, ops);
2546 	return ERR_PTR(err);
2547 }
2548 
2549 ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_create, ERRNO);
2550 
/*
 * Create a new VMA for a GPUVA map op. BO-backed VMAs are created inside a
 * validation guard so external BOs can be locked (and preempt fences added)
 * with drm_exec contention retries; userptr VMAs additionally get their
 * pages pinned. Returns the VMA or an ERR_PTR.
 */
static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
			      struct xe_vma_mem_attr *attr, unsigned int flags)
{
	struct xe_bo *bo = op->gem.obj ? gem_to_xe_bo(op->gem.obj) : NULL;
	struct xe_validation_ctx ctx;
	struct drm_exec exec;
	struct xe_vma *vma;
	int err = 0;

	lockdep_assert_held_write(&vm->lock);

	if (bo) {
		err = 0;
		xe_validation_guard(&ctx, &vm->xe->val, &exec,
				    (struct xe_val_flags) {.interruptible = true}, err) {
			/* External BO: the VM resv must be locked as well. */
			if (!bo->vm) {
				err = drm_exec_lock_obj(&exec, xe_vm_obj(vm));
				drm_exec_retry_on_contention(&exec);
			}
			if (!err) {
				err = drm_exec_lock_obj(&exec, &bo->ttm.base);
				drm_exec_retry_on_contention(&exec);
			}
			if (err)
				return ERR_PTR(err);

			vma = xe_vma_create(vm, bo, op->gem.offset,
					    op->va.addr, op->va.addr +
					    op->va.range - 1, attr, flags);
			if (IS_ERR(vma))
				return vma;

			/* External BOs must carry the VM's preempt fences. */
			if (!bo->vm) {
				err = add_preempt_fences(vm, bo);
				if (err) {
					prep_vma_destroy(vm, vma, false);
					xe_vma_destroy(vma, NULL);
				}
			}
		}
		if (err)
			return ERR_PTR(err);
	} else {
		vma = xe_vma_create(vm, NULL, op->gem.offset,
				    op->va.addr, op->va.addr +
				    op->va.range - 1, attr, flags);
		if (IS_ERR(vma))
			return vma;

		if (xe_vma_is_userptr(vma)) {
			err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
			/*
			 * -EBUSY has dedicated meaning that a user fence
			 * attached to the VMA is busy, in practice
			 * xe_vma_userptr_pin_pages can only fail with -EBUSY if
			 * we are low on memory so convert this to -ENOMEM.
			 */
			if (err == -EBUSY)
				err = -ENOMEM;
		}
	}
	if (err) {
		prep_vma_destroy(vm, vma, false);
		xe_vma_destroy_unlocked(vma);
		vma = ERR_PTR(err);
	}

	return vma;
}
2620 
2621 static u64 xe_vma_max_pte_size(struct xe_vma *vma)
2622 {
2623 	if (vma->gpuva.flags & XE_VMA_PTE_1G)
2624 		return SZ_1G;
2625 	else if (vma->gpuva.flags & (XE_VMA_PTE_2M | XE_VMA_PTE_COMPACT))
2626 		return SZ_2M;
2627 	else if (vma->gpuva.flags & XE_VMA_PTE_64K)
2628 		return SZ_64K;
2629 	else if (vma->gpuva.flags & XE_VMA_PTE_4K)
2630 		return SZ_4K;
2631 
2632 	return SZ_1G;	/* Uninitialized, used max size */
2633 }
2634 
2635 static void xe_vma_set_pte_size(struct xe_vma *vma, u64 size)
2636 {
2637 	switch (size) {
2638 	case SZ_1G:
2639 		vma->gpuva.flags |= XE_VMA_PTE_1G;
2640 		break;
2641 	case SZ_2M:
2642 		vma->gpuva.flags |= XE_VMA_PTE_2M;
2643 		break;
2644 	case SZ_64K:
2645 		vma->gpuva.flags |= XE_VMA_PTE_64K;
2646 		break;
2647 	case SZ_4K:
2648 		vma->gpuva.flags |= XE_VMA_PTE_4K;
2649 		break;
2650 	}
2651 }
2652 
/*
 * Commit a parsed VMA op into the VM's VMA tracking, setting the
 * XE_VMA_OP_*COMMITTED flags that xe_vma_op_unwind() later consults.
 * Returns 0 on success or a negative errno from VMA insertion.
 */
static int xe_vma_op_commit(struct xe_vm *vm, struct xe_vma_op *op)
{
	int err = 0;

	lockdep_assert_held_write(&vm->lock);

	switch (op->base.op) {
	case DRM_GPUVA_OP_MAP:
		err |= xe_vm_insert_vma(vm, op->map.vma);
		if (!err)
			op->flags |= XE_VMA_OP_COMMITTED;
		break;
	case DRM_GPUVA_OP_REMAP:
	{
		u8 tile_present =
			gpuva_to_vma(op->base.remap.unmap->va)->tile_present;

		prep_vma_destroy(vm, gpuva_to_vma(op->base.remap.unmap->va),
				 true);
		op->flags |= XE_VMA_OP_COMMITTED;

		if (op->remap.prev) {
			err |= xe_vm_insert_vma(vm, op->remap.prev);
			if (!err)
				op->flags |= XE_VMA_OP_PREV_COMMITTED;
			/* Skipped rebind inherits the old VMA's tile state */
			if (!err && op->remap.skip_prev) {
				op->remap.prev->tile_present =
					tile_present;
			}
		}
		if (op->remap.next) {
			err |= xe_vm_insert_vma(vm, op->remap.next);
			if (!err)
				op->flags |= XE_VMA_OP_NEXT_COMMITTED;
			if (!err && op->remap.skip_next) {
				op->remap.next->tile_present =
					tile_present;
			}
		}

		/*
		 * Adjust for partial unbind after removing VMA from VM. In case
		 * of unwind we might need to undo this later.
		 */
		if (!err) {
			op->base.remap.unmap->va->va.addr = op->remap.start;
			op->base.remap.unmap->va->va.range = op->remap.range;
		}
		break;
	}
	case DRM_GPUVA_OP_UNMAP:
		prep_vma_destroy(vm, gpuva_to_vma(op->base.unmap.va), true);
		op->flags |= XE_VMA_OP_COMMITTED;
		break;
	case DRM_GPUVA_OP_PREFETCH:
		op->flags |= XE_VMA_OP_COMMITTED;
		break;
	default:
		drm_warn(&vm->xe->drm, "NOT POSSIBLE\n");
	}

	return err;
}
2716 
2717 /**
2718  * xe_vma_has_default_mem_attrs - Check if a VMA has default memory attributes
2719  * @vma: Pointer to the xe_vma structure to check
2720  *
2721  * This function determines whether the given VMA (Virtual Memory Area)
2722  * has its memory attributes set to their default values. Specifically,
2723  * it checks the following conditions:
2724  *
2725  * - `atomic_access` is `DRM_XE_VMA_ATOMIC_UNDEFINED`
2726  * - `pat_index` is equal to `default_pat_index`
2727  * - `preferred_loc.devmem_fd` is `DRM_XE_PREFERRED_LOC_DEFAULT_DEVICE`
2728  * - `preferred_loc.migration_policy` is `DRM_XE_MIGRATE_ALL_PAGES`
2729  *
2730  * Return: true if all attributes are at their default values, false otherwise.
2731  */
2732 bool xe_vma_has_default_mem_attrs(struct xe_vma *vma)
2733 {
2734 	return (vma->attr.atomic_access == DRM_XE_ATOMIC_UNDEFINED &&
2735 		vma->attr.pat_index ==  vma->attr.default_pat_index &&
2736 		vma->attr.preferred_loc.devmem_fd == DRM_XE_PREFERRED_LOC_DEFAULT_DEVICE &&
2737 		vma->attr.preferred_loc.migration_policy == DRM_XE_MIGRATE_ALL_PAGES);
2738 }
2739 
/*
 * Parse a GPUVA op list into xe_vma_ops: create VMAs for map/remap pieces,
 * account required page-table update ops per tile, and commit each op into
 * the VM's VMA tracking. On error the caller is expected to unwind via
 * vm_bind_ioctl_ops_unwind(). Returns 0 or a negative errno.
 */
static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct drm_gpuva_ops *ops,
				   struct xe_vma_ops *vops)
{
	struct xe_device *xe = vm->xe;
	struct drm_gpuva_op *__op;
	struct xe_tile *tile;
	u8 id, tile_mask = 0;
	int err = 0;

	lockdep_assert_held_write(&vm->lock);

	for_each_tile(tile, vm->xe, id)
		tile_mask |= 0x1 << id;

	drm_gpuva_for_each_op(__op, ops) {
		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
		struct xe_vma *vma;
		unsigned int flags = 0;

		INIT_LIST_HEAD(&op->link);
		list_add_tail(&op->link, &vops->list);
		op->tile_mask = tile_mask;

		switch (op->base.op) {
		case DRM_GPUVA_OP_MAP:
		{
			/* Fresh mappings start with default memory attrs. */
			struct xe_vma_mem_attr default_attr = {
				.preferred_loc = {
					.devmem_fd = DRM_XE_PREFERRED_LOC_DEFAULT_DEVICE,
					.migration_policy = DRM_XE_MIGRATE_ALL_PAGES,
				},
				.atomic_access = DRM_XE_ATOMIC_UNDEFINED,
				.default_pat_index = op->map.pat_index,
				.pat_index = op->map.pat_index,
				.purgeable_state = XE_MADV_PURGEABLE_WILLNEED,
			};

			flags |= op->map.vma_flags & XE_VMA_CREATE_MASK;

			vma = new_vma(vm, &op->base.map, &default_attr,
				      flags);
			if (IS_ERR(vma))
				return PTR_ERR(vma);

			op->map.vma = vma;
			/* SVM-allocator maps bind lazily via GPU faults. */
			if (((op->map.immediate || !xe_vm_in_fault_mode(vm)) &&
			     !(op->map.vma_flags & XE_VMA_SYSTEM_ALLOCATOR)) ||
			    op->map.invalidate_on_bind)
				xe_vma_ops_incr_pt_update_ops(vops,
							      op->tile_mask, 1);
			break;
		}
		case DRM_GPUVA_OP_REMAP:
		{
			struct xe_vma *old =
				gpuva_to_vma(op->base.remap.unmap->va);
			bool skip = xe_vma_is_cpu_addr_mirror(old);
			u64 start = xe_vma_start(old), end = xe_vma_end(old);
			int num_remap_ops = 0;

			/* Narrow [start, end) to the actually-unmapped hole. */
			if (op->base.remap.prev)
				start = op->base.remap.prev->va.addr +
					op->base.remap.prev->va.range;
			if (op->base.remap.next)
				end = op->base.remap.next->va.addr;

			if (xe_vma_is_cpu_addr_mirror(old) &&
			    xe_svm_has_mapping(vm, start, end)) {
				if (vops->flags & XE_VMA_OPS_FLAG_MADVISE)
					xe_svm_unmap_address_range(vm, start, end);
				else
					return -EBUSY;
			}

			op->remap.start = xe_vma_start(old);
			op->remap.range = xe_vma_size(old);
			/* Preserve originals so unwind can restore them. */
			op->remap.old_start = op->remap.start;
			op->remap.old_range = op->remap.range;

			flags |= op->base.remap.unmap->va->flags & XE_VMA_CREATE_MASK;
			if (op->base.remap.prev) {
				vma = new_vma(vm, op->base.remap.prev,
					      &old->attr, flags);
				if (IS_ERR(vma))
					return PTR_ERR(vma);

				op->remap.prev = vma;

				/*
				 * Userptr creates a new SG mapping so
				 * we must also rebind.
				 */
				op->remap.skip_prev = skip ||
					(!xe_vma_is_userptr(old) &&
					IS_ALIGNED(xe_vma_end(vma),
						   xe_vma_max_pte_size(old)));
				if (op->remap.skip_prev) {
					xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
					op->remap.range -=
						xe_vma_end(vma) -
						xe_vma_start(old);
					op->remap.start = xe_vma_end(vma);
					vm_dbg(&xe->drm, "REMAP:SKIP_PREV: addr=0x%016llx, range=0x%016llx",
					       (ULL)op->remap.start,
					       (ULL)op->remap.range);
				} else {
					num_remap_ops++;
				}
			}

			if (op->base.remap.next) {
				vma = new_vma(vm, op->base.remap.next,
					      &old->attr, flags);
				if (IS_ERR(vma))
					return PTR_ERR(vma);

				op->remap.next = vma;

				/*
				 * Userptr creates a new SG mapping so
				 * we must also rebind.
				 */
				op->remap.skip_next = skip ||
					(!xe_vma_is_userptr(old) &&
					IS_ALIGNED(xe_vma_start(vma),
						   xe_vma_max_pte_size(old)));
				if (op->remap.skip_next) {
					xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
					op->remap.range -=
						xe_vma_end(old) -
						xe_vma_start(vma);
					vm_dbg(&xe->drm, "REMAP:SKIP_NEXT: addr=0x%016llx, range=0x%016llx",
					       (ULL)op->remap.start,
					       (ULL)op->remap.range);
				} else {
					num_remap_ops++;
				}
			}
			if (!skip)
				num_remap_ops++;

			xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask, num_remap_ops);
			break;
		}
		case DRM_GPUVA_OP_UNMAP:
			vma = gpuva_to_vma(op->base.unmap.va);

			if (xe_vma_is_cpu_addr_mirror(vma) &&
			    xe_svm_has_mapping(vm, xe_vma_start(vma),
					       xe_vma_end(vma)) &&
			    !(vops->flags & XE_VMA_OPS_FLAG_ALLOW_SVM_UNMAP))
				return -EBUSY;

			if (!xe_vma_is_cpu_addr_mirror(vma))
				xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask, 1);
			break;
		case DRM_GPUVA_OP_PREFETCH:
			vma = gpuva_to_vma(op->base.prefetch.va);

			if (xe_vma_is_userptr(vma)) {
				err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
				if (err)
					return err;
			}

			/* SVM prefetch needs one PT update per SVM range. */
			if (xe_vma_is_cpu_addr_mirror(vma))
				xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask,
							      op->prefetch_range.ranges_count);
			else
				xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask, 1);

			break;
		default:
			drm_warn(&vm->xe->drm, "NOT POSSIBLE\n");
		}

		err = xe_vma_op_commit(vm, op);
		if (err)
			return err;
	}

	return 0;
}
2923 
/*
 * Undo a single parsed/committed VMA op: destroy VMAs created during parse
 * and re-insert VMAs removed during commit. The *post_commit flags mirror
 * the XE_VMA_OP_*COMMITTED flags set by xe_vma_op_commit().
 */
static void xe_vma_op_unwind(struct xe_vm *vm, struct xe_vma_op *op,
			     bool post_commit, bool prev_post_commit,
			     bool next_post_commit)
{
	lockdep_assert_held_write(&vm->lock);

	switch (op->base.op) {
	case DRM_GPUVA_OP_MAP:
		if (op->map.vma) {
			prep_vma_destroy(vm, op->map.vma, post_commit);
			xe_vma_destroy_unlocked(op->map.vma);
		}
		break;
	case DRM_GPUVA_OP_UNMAP:
	{
		struct xe_vma *vma = gpuva_to_vma(op->base.unmap.va);

		if (vma) {
			/* Clear the destroyed mark set at commit time. */
			xe_svm_notifier_lock(vm);
			vma->gpuva.flags &= ~XE_VMA_DESTROYED;
			xe_svm_notifier_unlock(vm);
			if (post_commit)
				xe_vm_insert_vma(vm, vma);
		}
		break;
	}
	case DRM_GPUVA_OP_REMAP:
	{
		struct xe_vma *vma = gpuva_to_vma(op->base.remap.unmap->va);

		if (op->remap.prev) {
			prep_vma_destroy(vm, op->remap.prev, prev_post_commit);
			xe_vma_destroy_unlocked(op->remap.prev);
		}
		if (op->remap.next) {
			prep_vma_destroy(vm, op->remap.next, next_post_commit);
			xe_vma_destroy_unlocked(op->remap.next);
		}
		if (vma) {
			xe_svm_notifier_lock(vm);
			vma->gpuva.flags &= ~XE_VMA_DESTROYED;
			xe_svm_notifier_unlock(vm);
			if (post_commit) {
				/*
				 * Restore the old va range, in case of the
				 * prev/next skip optimisation. Otherwise what
				 * we re-insert here could be smaller than the
				 * original range.
				 */
				op->base.remap.unmap->va->va.addr =
					op->remap.old_start;
				op->base.remap.unmap->va->va.range =
					op->remap.old_range;
				xe_vm_insert_vma(vm, vma);
			}
		}
		break;
	}
	case DRM_GPUVA_OP_PREFETCH:
		/* Nothing to do */
		break;
	default:
		drm_warn(&vm->xe->drm, "NOT POSSIBLE\n");
	}
}
2989 
2990 static void vm_bind_ioctl_ops_unwind(struct xe_vm *vm,
2991 				     struct drm_gpuva_ops **ops,
2992 				     int num_ops_list)
2993 {
2994 	int i;
2995 
2996 	for (i = num_ops_list - 1; i >= 0; --i) {
2997 		struct drm_gpuva_ops *__ops = ops[i];
2998 		struct drm_gpuva_op *__op;
2999 
3000 		if (!__ops)
3001 			continue;
3002 
3003 		drm_gpuva_for_each_op_reverse(__op, __ops) {
3004 			struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
3005 
3006 			xe_vma_op_unwind(vm, op,
3007 					 op->flags & XE_VMA_OP_COMMITTED,
3008 					 op->flags & XE_VMA_OP_PREV_COMMITTED,
3009 					 op->flags & XE_VMA_OP_NEXT_COMMITTED);
3010 		}
3011 	}
3012 }
3013 
3014 /**
3015  * struct xe_vma_lock_and_validate_flags - Flags for vma_lock_and_validate()
3016  * @res_evict: Allow evicting resources during validation
3017  * @validate: Perform BO validation
3018  * @request_decompress: Request BO decompression
3019  * @check_purged: Reject operation if BO is purged
3020  */
3021 struct xe_vma_lock_and_validate_flags {
3022 	u32 res_evict : 1;
3023 	u32 validate : 1;
3024 	u32 request_decompress : 1;
3025 	u32 check_purged : 1;
3026 };
3027 
/*
 * Lock @vma's backing BO (if any, and if it's an external BO not sharing
 * the VM's resv), then optionally run the purge check, validation, and
 * decompression selected by @flags. VMAs with no BO are a no-op success.
 * Returns 0 or a negative errno.
 */
static int vma_lock_and_validate(struct drm_exec *exec, struct xe_vma *vma,
				 struct xe_vma_lock_and_validate_flags flags)
{
	struct xe_bo *bo = xe_vma_bo(vma);
	struct xe_vm *vm = xe_vma_vm(vma);
	int err = 0;

	if (bo) {
		/* BOs owned by the VM are already covered by the VM resv. */
		if (!bo->vm)
			err = drm_exec_lock_obj(exec, &bo->ttm.base);

		/* Reject new mappings to DONTNEED/purged BOs; allow cleanup operations */
		if (!err && flags.check_purged) {
			if (xe_bo_madv_is_dontneed(bo))
				err = -EBUSY;  /* BO marked purgeable */
			else if (xe_bo_is_purged(bo))
				err = -EINVAL; /* BO already purged */
		}

		if (!err && flags.validate)
			err = xe_bo_validate(bo, vm,
					     xe_vm_allow_vm_eviction(vm) &&
					     flags.res_evict, exec);

		if (err)
			return err;

		if (flags.request_decompress)
			err = xe_bo_decompress(bo);
	}

	return err;
}
3061 
3062 static int check_ufence(struct xe_vma *vma)
3063 {
3064 	if (vma->ufence) {
3065 		struct xe_user_fence * const f = vma->ufence;
3066 
3067 		if (!xe_sync_ufence_get_status(f))
3068 			return -EBUSY;
3069 
3070 		vma->ufence = NULL;
3071 		xe_sync_ufence_put(f);
3072 	}
3073 
3074 	return 0;
3075 }
3076 
/*
 * Migrate and populate every SVM range stashed on a prefetch op: move to
 * system memory when no destination pagemap was resolved, allocate VRAM
 * when migration to device memory is needed, then get the pages. Only
 * applies to CPU-address-mirror VMAs. Returns 0 or a negative errno
 * (-ENODATA asks userspace to retry).
 */
static int prefetch_ranges(struct xe_vm *vm, struct xe_vma_op *op)
{
	bool devmem_possible = IS_DGFX(vm->xe) && IS_ENABLED(CONFIG_DRM_XE_PAGEMAP);
	struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
	struct drm_pagemap *dpagemap = op->prefetch_range.dpagemap;
	int err = 0;

	struct xe_svm_range *svm_range;
	struct drm_gpusvm_ctx ctx = {};
	unsigned long i;

	if (!xe_vma_is_cpu_addr_mirror(vma))
		return 0;

	ctx.read_only = xe_vma_read_only(vma);
	ctx.devmem_possible = devmem_possible;
	ctx.check_pages_threshold = devmem_possible ? SZ_64K : 0;
	ctx.device_private_page_owner = xe_svm_private_page_owner(vm, !dpagemap);

	/* TODO: Threading the migration */
	xa_for_each(&op->prefetch_range.range, i, svm_range) {
		/* No device pagemap target: prefetch into system memory. */
		if (!dpagemap)
			xe_svm_range_migrate_to_smem(vm, svm_range);

		if (IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)) {
			drm_dbg(&vm->xe->drm,
				"Prefetch pagemap is %s start 0x%016lx end 0x%016lx\n",
				dpagemap ? dpagemap->drm->unique : "system",
				xe_svm_range_start(svm_range), xe_svm_range_end(svm_range));
		}

		if (xe_svm_range_needs_migrate_to_vram(svm_range, vma, dpagemap)) {
			err = xe_svm_alloc_vram(svm_range, &ctx, dpagemap);
			if (err) {
				drm_dbg(&vm->xe->drm, "VRAM allocation failed, retry from userspace, asid=%u, gpusvm=%p, errno=%pe\n",
					vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
				return -ENODATA;
			}
			xe_svm_range_debug(svm_range, "PREFETCH - RANGE MIGRATED TO VRAM");
		}

		err = xe_svm_range_get_pages(vm, svm_range, &ctx);
		if (err) {
			drm_dbg(&vm->xe->drm, "Get pages failed, asid=%u, gpusvm=%p, errno=%pe\n",
				vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
			/* Normalize expected faults into a retryable error. */
			if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM)
				err = -ENODATA;
			return err;
		}
		xe_svm_range_debug(svm_range, "PREFETCH - RANGE GET PAGES DONE");
	}

	return err;
}
3131 
/*
 * Lock, validate, and prepare the VMAs touched by a single bind op within
 * an ongoing drm_exec transaction. Checks user fences on unmap/remap,
 * validates new backing stores, and migrates BOs for non-SVM prefetch.
 * Returns 0 or a negative errno.
 */
static int op_lock_and_prep(struct drm_exec *exec, struct xe_vm *vm,
			    struct xe_vma_ops *vops, struct xe_vma_op *op)
{
	int err = 0;
	bool res_evict;

	/*
	 * We only allow evicting a BO within the VM if it is not part of an
	 * array of binds, as an array of binds can evict another BO within the
	 * bind.
	 */
	res_evict = !(vops->flags & XE_VMA_OPS_ARRAY_OF_BINDS);

	switch (op->base.op) {
	case DRM_GPUVA_OP_MAP:
		if (!op->map.invalidate_on_bind)
			err = vma_lock_and_validate(exec, op->map.vma,
						    (struct xe_vma_lock_and_validate_flags) {
							.res_evict = res_evict,
							.validate = !xe_vm_in_fault_mode(vm) ||
								    op->map.immediate,
							.request_decompress =
							op->map.request_decompress,
							.check_purged = true,
						    });
		break;
	case DRM_GPUVA_OP_REMAP:
		err = check_ufence(gpuva_to_vma(op->base.remap.unmap->va));
		if (err)
			break;

		/* Old VMA only needs locking; the new pieces need validation. */
		err = vma_lock_and_validate(exec,
					    gpuva_to_vma(op->base.remap.unmap->va),
					    (struct xe_vma_lock_and_validate_flags) {
						    .res_evict = res_evict,
						    .validate = false,
						    .request_decompress = false,
						    .check_purged = false,
					    });
		if (!err && op->remap.prev)
			err = vma_lock_and_validate(exec, op->remap.prev,
						    (struct xe_vma_lock_and_validate_flags) {
							    .res_evict = res_evict,
							    .validate = true,
							    .request_decompress = false,
							    .check_purged = true,
						    });
		if (!err && op->remap.next)
			err = vma_lock_and_validate(exec, op->remap.next,
						    (struct xe_vma_lock_and_validate_flags) {
							    .res_evict = res_evict,
							    .validate = true,
							    .request_decompress = false,
							    .check_purged = true,
						    });
		break;
	case DRM_GPUVA_OP_UNMAP:
		err = check_ufence(gpuva_to_vma(op->base.unmap.va));
		if (err)
			break;

		err = vma_lock_and_validate(exec,
					    gpuva_to_vma(op->base.unmap.va),
					    (struct xe_vma_lock_and_validate_flags) {
						    .res_evict = res_evict,
						    .validate = false,
						    .request_decompress = false,
						    .check_purged = false,
					    });
		break;
	case DRM_GPUVA_OP_PREFETCH:
	{
		struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
		u32 region;

		/*
		 * NOTE(review): @region is only set for non-mirror VMAs; the
		 * migrate call below appears unreachable for mirror VMAs via
		 * the xe_vma_has_no_bo() guard — confirm mirror VMAs never
		 * carry a BO. Also the assert permits
		 * region == ARRAY_SIZE(region_to_mem_type), which would index
		 * one past the array in xe_bo_migrate() — confirm the valid
		 * region encoding.
		 */
		if (!xe_vma_is_cpu_addr_mirror(vma)) {
			region = op->prefetch.region;
			xe_assert(vm->xe, region == DRM_XE_CONSULT_MEM_ADVISE_PREF_LOC ||
				  region <= ARRAY_SIZE(region_to_mem_type));
		}

		/*
		 * Prefetch attempts to migrate BO's backing store without
		 * repopulating it first. Purged BOs have no backing store
		 * to migrate, so reject the operation.
		 */
		err = vma_lock_and_validate(exec,
					    gpuva_to_vma(op->base.prefetch.va),
					    (struct xe_vma_lock_and_validate_flags) {
						    .res_evict = res_evict,
						    .validate = false,
						    .request_decompress = false,
						    .check_purged = true,
					    });
		if (!err && !xe_vma_has_no_bo(vma))
			err = xe_bo_migrate(xe_vma_bo(vma),
					    region_to_mem_type[region],
					    NULL,
					    exec);
		break;
	}
	default:
		drm_warn(&vm->xe->drm, "NOT POSSIBLE\n");
	}

	return err;
}
3239 
3240 static int vm_bind_ioctl_ops_prefetch_ranges(struct xe_vm *vm, struct xe_vma_ops *vops)
3241 {
3242 	struct xe_vma_op *op;
3243 	int err;
3244 
3245 	if (!(vops->flags & XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH))
3246 		return 0;
3247 
3248 	list_for_each_entry(op, &vops->list, link) {
3249 		if (op->base.op  == DRM_GPUVA_OP_PREFETCH) {
3250 			err = prefetch_ranges(vm, op);
3251 			if (err)
3252 				return err;
3253 		}
3254 	}
3255 
3256 	return 0;
3257 }
3258 
/*
 * Lock the VM resv, then lock and prepare every op in @vops within the
 * same drm_exec transaction. Returns 0 or a negative errno; an optional
 * error-injection point exists for testing.
 */
static int vm_bind_ioctl_ops_lock_and_prep(struct drm_exec *exec,
					   struct xe_vm *vm,
					   struct xe_vma_ops *vops)
{
	struct xe_vma_op *op;
	int err;

	err = drm_exec_lock_obj(exec, xe_vm_obj(vm));
	if (err)
		return err;

	list_for_each_entry(op, &vops->list, link) {
		err = op_lock_and_prep(exec, vm, vops, op);
		if (err)
			return err;
	}

#ifdef TEST_VM_OPS_ERROR
	if (vops->inject_error &&
	    vm->xe->vm_inject_error_position == FORCE_OP_ERROR_LOCK)
		return -ENOSPC;
#endif

	return 0;
}
3284 
3285 static void op_trace(struct xe_vma_op *op)
3286 {
3287 	switch (op->base.op) {
3288 	case DRM_GPUVA_OP_MAP:
3289 		trace_xe_vma_bind(op->map.vma);
3290 		break;
3291 	case DRM_GPUVA_OP_REMAP:
3292 		trace_xe_vma_unbind(gpuva_to_vma(op->base.remap.unmap->va));
3293 		if (op->remap.prev)
3294 			trace_xe_vma_bind(op->remap.prev);
3295 		if (op->remap.next)
3296 			trace_xe_vma_bind(op->remap.next);
3297 		break;
3298 	case DRM_GPUVA_OP_UNMAP:
3299 		trace_xe_vma_unbind(gpuva_to_vma(op->base.unmap.va));
3300 		break;
3301 	case DRM_GPUVA_OP_PREFETCH:
3302 		trace_xe_vma_bind(gpuva_to_vma(op->base.prefetch.va));
3303 		break;
3304 	case DRM_GPUVA_OP_DRIVER:
3305 		break;
3306 	default:
3307 		XE_WARN_ON("NOT POSSIBLE");
3308 	}
3309 }
3310 
3311 static void trace_xe_vm_ops_execute(struct xe_vma_ops *vops)
3312 {
3313 	struct xe_vma_op *op;
3314 
3315 	list_for_each_entry(op, &vops->list, link)
3316 		op_trace(op);
3317 }
3318 
/*
 * vm_ops_setup_tile_args() - Assign a PT-update exec queue for each tile
 * @vm: The VM the operations execute against.
 * @vops: The VMA operations whose per-tile queue state is set up.
 *
 * For every tile, pick the exec queue that will run its page-table update
 * operations: either the user-supplied queue @vops->q (advancing along its
 * multi-GT list so each tile with a page-table root gets its own entry),
 * or the VM's default queue for that tile.
 *
 * Return: the number of tiles that have operations to execute.
 */
static int vm_ops_setup_tile_args(struct xe_vm *vm, struct xe_vma_ops *vops)
{
	struct xe_exec_queue *q = vops->q;
	struct xe_tile *tile;
	int number_tiles = 0;
	u8 id;

	for_each_tile(tile, vm->xe, id) {
		if (vops->pt_update_ops[id].num_ops)
			++number_tiles;

		/* Queue already assigned for this tile — leave it alone */
		if (vops->pt_update_ops[id].q)
			continue;

		if (q) {
			vops->pt_update_ops[id].q = q;
			/* Step to the next queue in the multi-GT chain */
			if (vm->pt_root[id] && !list_empty(&q->multi_gt_list))
				q = list_next_entry(q, multi_gt_list);
		} else {
			vops->pt_update_ops[id].q = vm->q[id];
		}
	}

	return number_tiles;
}
3344 
/*
 * ops_execute() - Run all queued page-table update operations
 * @vm: The VM being modified.
 * @vops: The VMA operations to execute.
 *
 * Prepares and runs the PT update operations on every tile that has work,
 * collecting one completion fence per tile (plus, unless
 * XE_VMA_OPS_FLAG_SKIP_TLB_WAIT is set, the last TLB-invalidation fence of
 * each invalidation unit) into a single dma_fence_array.
 *
 * Return: composite fence on success, ERR_PTR(-ENODATA) if no tile has
 * operations, other ERR_PTR on failure. On failure all prepared tiles are
 * aborted and collected fences are dropped.
 */
static struct dma_fence *ops_execute(struct xe_vm *vm,
				     struct xe_vma_ops *vops)
{
	struct xe_tile *tile;
	struct dma_fence *fence = NULL;
	struct dma_fence **fences = NULL;
	struct dma_fence_array *cf = NULL;
	int number_tiles = 0, current_fence = 0, n_fence = 0, err, i;
	u8 id;

	number_tiles = vm_ops_setup_tile_args(vm, vops);
	if (number_tiles == 0)
		return ERR_PTR(-ENODATA);

	/*
	 * Count fence slots: one per tile, plus one per TLB invalidation
	 * unit per tile unless the caller opts out of the TLB wait.
	 */
	for_each_tile(tile, vm->xe, id) {
		++n_fence;

		if (!(vops->flags & XE_VMA_OPS_FLAG_SKIP_TLB_WAIT))
			for_each_tlb_inval(i)
				++n_fence;
	}

	fences = kmalloc_objs(*fences, n_fence);
	if (!fences) {
		fence = ERR_PTR(-ENOMEM);
		goto err_trace;
	}

	cf = dma_fence_array_alloc(n_fence);
	if (!cf) {
		fence = ERR_PTR(-ENOMEM);
		goto err_out;
	}

	/* Prepare phase: must fully succeed on all tiles before running */
	for_each_tile(tile, vm->xe, id) {
		if (!vops->pt_update_ops[id].num_ops)
			continue;

		err = xe_pt_update_ops_prepare(tile, vops);
		if (err) {
			fence = ERR_PTR(err);
			goto err_out;
		}
	}

	trace_xe_vm_ops_execute(vops);

	for_each_tile(tile, vm->xe, id) {
		struct xe_exec_queue *q = vops->pt_update_ops[tile->id].q;

		fence = NULL;
		if (!vops->pt_update_ops[id].num_ops)
			goto collect_fences;

		fence = xe_pt_update_ops_run(tile, vops);
		if (IS_ERR(fence))
			goto err_out;

collect_fences:
		/* Tiles with no work contribute an already-signaled stub */
		fences[current_fence++] = fence ?: dma_fence_get_stub();
		if (vops->flags & XE_VMA_OPS_FLAG_SKIP_TLB_WAIT)
			continue;

		/* Snapshot the last TLB invalidation fences under job lock */
		xe_migrate_job_lock(tile->migrate, q);
		for_each_tlb_inval(i)
			fences[current_fence++] =
				xe_exec_queue_tlb_inval_last_fence_get(q, vm, i);
		xe_migrate_job_unlock(tile->migrate, q);
	}

	xe_assert(vm->xe, current_fence == n_fence);
	/* The array takes ownership of @fences and the collected references */
	dma_fence_array_init(cf, n_fence, fences, dma_fence_context_alloc(1),
			     1, false);
	fence = &cf->base;

	for_each_tile(tile, vm->xe, id) {
		if (!vops->pt_update_ops[id].num_ops)
			continue;

		xe_pt_update_ops_fini(tile, vops);
	}

	return fence;

err_out:
	/* Undo the prepare phase on every tile that had work */
	for_each_tile(tile, vm->xe, id) {
		if (!vops->pt_update_ops[id].num_ops)
			continue;

		xe_pt_update_ops_abort(tile, vops);
	}
	while (current_fence)
		dma_fence_put(fences[--current_fence]);
	kfree(fences);
	kfree(cf);

err_trace:
	trace_xe_vm_ops_fail(vm);
	return fence;
}
3445 
3446 static void vma_add_ufence(struct xe_vma *vma, struct xe_user_fence *ufence)
3447 {
3448 	if (vma->ufence)
3449 		xe_sync_ufence_put(vma->ufence);
3450 	vma->ufence = __xe_sync_ufence_get(ufence);
3451 }
3452 
3453 static void op_add_ufence(struct xe_vm *vm, struct xe_vma_op *op,
3454 			  struct xe_user_fence *ufence)
3455 {
3456 	switch (op->base.op) {
3457 	case DRM_GPUVA_OP_MAP:
3458 		if (!xe_vma_is_cpu_addr_mirror(op->map.vma))
3459 			vma_add_ufence(op->map.vma, ufence);
3460 		break;
3461 	case DRM_GPUVA_OP_REMAP:
3462 		if (op->remap.prev)
3463 			vma_add_ufence(op->remap.prev, ufence);
3464 		if (op->remap.next)
3465 			vma_add_ufence(op->remap.next, ufence);
3466 		break;
3467 	case DRM_GPUVA_OP_UNMAP:
3468 		break;
3469 	case DRM_GPUVA_OP_PREFETCH:
3470 		vma_add_ufence(gpuva_to_vma(op->base.prefetch.va), ufence);
3471 		break;
3472 	default:
3473 		drm_warn(&vm->xe->drm, "NOT POSSIBLE\n");
3474 	}
3475 }
3476 
/*
 * vm_bind_ioctl_ops_fini() - Finalize executed VMA operations
 * @vm: The VM the operations ran against.
 * @vops: The executed VMA operations.
 * @fence: Completion fence of the execution, or NULL if nothing ran.
 *
 * Attaches the user fence (if any sync entry carries one) to all bound
 * VMAs, destroys VMAs removed by unmap/remap once @fence signals, and
 * signals all sync entries with @fence.
 */
static void vm_bind_ioctl_ops_fini(struct xe_vm *vm, struct xe_vma_ops *vops,
				   struct dma_fence *fence)
{
	struct xe_user_fence *ufence;
	struct xe_vma_op *op;
	int i;

	ufence = find_ufence_get(vops->syncs, vops->num_syncs);
	list_for_each_entry(op, &vops->list, link) {
		if (ufence)
			op_add_ufence(vm, op, ufence);

		/* Defer destruction of unmapped VMAs until @fence signals */
		if (op->base.op == DRM_GPUVA_OP_UNMAP)
			xe_vma_destroy(gpuva_to_vma(op->base.unmap.va), fence);
		else if (op->base.op == DRM_GPUVA_OP_REMAP)
			xe_vma_destroy(gpuva_to_vma(op->base.remap.unmap->va),
				       fence);
	}
	/* Drop the reference taken by find_ufence_get() */
	if (ufence)
		xe_sync_ufence_put(ufence);
	if (fence) {
		for (i = 0; i < vops->num_syncs; i++)
			xe_sync_entry_signal(vops->syncs + i, fence);
	}
}
3502 
/*
 * vm_bind_ioctl_ops_execute() - Lock, execute and finalize VMA operations
 * @vm: The VM being modified; its write lock must be held.
 * @vops: The VMA operations to execute.
 *
 * Runs the lock/prepare and execute phases inside a validation guard that
 * transparently retries on drm_exec contention and on OOM. On -ENODATA
 * from ops_execute() (no tile had work) the operations are still finalized
 * without a fence so unmapped VMAs are destroyed and syncs handled.
 *
 * Return: composite completion fence on success, ERR_PTR on failure.
 */
static struct dma_fence *vm_bind_ioctl_ops_execute(struct xe_vm *vm,
						   struct xe_vma_ops *vops)
{
	struct xe_validation_ctx ctx;
	struct drm_exec exec;
	/*
	 * NOTE(review): @fence is assigned only inside the guard body;
	 * presumably xe_validation_guard() either runs the body or sets
	 * @err — confirm against the macro definition.
	 */
	struct dma_fence *fence;
	int err = 0;

	lockdep_assert_held_write(&vm->lock);

	xe_validation_guard(&ctx, &vm->xe->val, &exec,
			    ((struct xe_val_flags) {
				    .interruptible = true,
				    .exec_ignore_duplicates = true,
			    }), err) {
		err = vm_bind_ioctl_ops_lock_and_prep(&exec, vm, vops);
		drm_exec_retry_on_contention(&exec);
		xe_validation_retry_on_oom(&ctx, &err);
		if (err)
			return ERR_PTR(err);

		/* Publish @exec so nested validation can reuse the locks */
		xe_vm_set_validation_exec(vm, &exec);
		fence = ops_execute(vm, vops);
		xe_vm_set_validation_exec(vm, NULL);
		if (IS_ERR(fence)) {
			/* -ENODATA == no work; still finalize the ops */
			if (PTR_ERR(fence) == -ENODATA)
				vm_bind_ioctl_ops_fini(vm, vops, NULL);
			return fence;
		}

		vm_bind_ioctl_ops_fini(vm, vops, fence);
	}

	return err ? ERR_PTR(err) : fence;
}
ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_execute, ERRNO);
3539 
3540 #define SUPPORTED_FLAGS_STUB  \
3541 	(DRM_XE_VM_BIND_FLAG_READONLY | \
3542 	 DRM_XE_VM_BIND_FLAG_IMMEDIATE | \
3543 	 DRM_XE_VM_BIND_FLAG_NULL | \
3544 	 DRM_XE_VM_BIND_FLAG_DUMPABLE | \
3545 	 DRM_XE_VM_BIND_FLAG_CHECK_PXP | \
3546 	 DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR | \
3547 	 DRM_XE_VM_BIND_FLAG_MADVISE_AUTORESET | \
3548 	 DRM_XE_VM_BIND_FLAG_DECOMPRESS)
3549 
3550 #ifdef TEST_VM_OPS_ERROR
3551 #define SUPPORTED_FLAGS	(SUPPORTED_FLAGS_STUB | FORCE_OP_ERROR)
3552 #else
3553 #define SUPPORTED_FLAGS	SUPPORTED_FLAGS_STUB
3554 #endif
3555 
3556 #define XE_64K_PAGE_MASK 0xffffull
3557 #define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
3558 
3559 static int vm_bind_ioctl_check_args(struct xe_device *xe, struct xe_vm *vm,
3560 				    struct drm_xe_vm_bind *args,
3561 				    struct drm_xe_vm_bind_op **bind_ops)
3562 {
3563 	int err;
3564 	int i;
3565 
3566 	if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
3567 	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
3568 		return -EINVAL;
3569 
3570 	if (XE_IOCTL_DBG(xe, args->extensions))
3571 		return -EINVAL;
3572 
3573 	if (XE_IOCTL_DBG(xe, args->num_syncs > DRM_XE_MAX_SYNCS))
3574 		return -EINVAL;
3575 
3576 	if (args->num_binds > 1) {
3577 		u64 __user *bind_user =
3578 			u64_to_user_ptr(args->vector_of_binds);
3579 
3580 		*bind_ops = kvmalloc_objs(struct drm_xe_vm_bind_op,
3581 					  args->num_binds,
3582 					  GFP_KERNEL | __GFP_ACCOUNT | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
3583 		if (!*bind_ops)
3584 			return args->num_binds > 1 ? -ENOBUFS : -ENOMEM;
3585 
3586 		err = copy_from_user(*bind_ops, bind_user,
3587 				     sizeof(struct drm_xe_vm_bind_op) *
3588 				     args->num_binds);
3589 		if (XE_IOCTL_DBG(xe, err)) {
3590 			err = -EFAULT;
3591 			goto free_bind_ops;
3592 		}
3593 	} else {
3594 		*bind_ops = &args->bind;
3595 	}
3596 
3597 	for (i = 0; i < args->num_binds; ++i) {
3598 		u64 range = (*bind_ops)[i].range;
3599 		u64 addr = (*bind_ops)[i].addr;
3600 		u32 op = (*bind_ops)[i].op;
3601 		u32 flags = (*bind_ops)[i].flags;
3602 		u32 obj = (*bind_ops)[i].obj;
3603 		u64 obj_offset = (*bind_ops)[i].obj_offset;
3604 		u32 prefetch_region = (*bind_ops)[i].prefetch_mem_region_instance;
3605 		bool is_null = flags & DRM_XE_VM_BIND_FLAG_NULL;
3606 		bool is_cpu_addr_mirror = flags &
3607 			DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR;
3608 		bool is_decompress = flags & DRM_XE_VM_BIND_FLAG_DECOMPRESS;
3609 		u16 pat_index = (*bind_ops)[i].pat_index;
3610 		u16 coh_mode;
3611 		bool comp_en;
3612 
3613 		if (XE_IOCTL_DBG(xe, is_cpu_addr_mirror &&
3614 				 (!xe_vm_in_fault_mode(vm) ||
3615 				 !IS_ENABLED(CONFIG_DRM_XE_GPUSVM)))) {
3616 			err = -EINVAL;
3617 			goto free_bind_ops;
3618 		}
3619 
3620 		if (XE_IOCTL_DBG(xe, pat_index >= xe->pat.n_entries)) {
3621 			err = -EINVAL;
3622 			goto free_bind_ops;
3623 		}
3624 
3625 		pat_index = array_index_nospec(pat_index, xe->pat.n_entries);
3626 		(*bind_ops)[i].pat_index = pat_index;
3627 		coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
3628 		comp_en = xe_pat_index_get_comp_en(xe, pat_index);
3629 		if (XE_IOCTL_DBG(xe, !coh_mode)) { /* hw reserved */
3630 			err = -EINVAL;
3631 			goto free_bind_ops;
3632 		}
3633 
3634 		if (XE_WARN_ON(coh_mode > XE_COH_2WAY)) {
3635 			err = -EINVAL;
3636 			goto free_bind_ops;
3637 		}
3638 
3639 		if (XE_IOCTL_DBG(xe, op > DRM_XE_VM_BIND_OP_PREFETCH) ||
3640 		    XE_IOCTL_DBG(xe, flags & ~SUPPORTED_FLAGS) ||
3641 		    XE_IOCTL_DBG(xe, obj && (is_null || is_cpu_addr_mirror)) ||
3642 		    XE_IOCTL_DBG(xe, obj_offset && (is_null ||
3643 						    is_cpu_addr_mirror)) ||
3644 		    XE_IOCTL_DBG(xe, op != DRM_XE_VM_BIND_OP_MAP &&
3645 				 (is_decompress || is_null || is_cpu_addr_mirror)) ||
3646 		    XE_IOCTL_DBG(xe, is_decompress &&
3647 				 xe_pat_index_get_comp_en(xe, pat_index)) ||
3648 		    XE_IOCTL_DBG(xe, !obj &&
3649 				 op == DRM_XE_VM_BIND_OP_MAP &&
3650 				 !is_null && !is_cpu_addr_mirror) ||
3651 		    XE_IOCTL_DBG(xe, !obj &&
3652 				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
3653 		    XE_IOCTL_DBG(xe, addr &&
3654 				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
3655 		    XE_IOCTL_DBG(xe, range &&
3656 				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
3657 		    XE_IOCTL_DBG(xe, obj &&
3658 				 op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
3659 		    XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE &&
3660 				 op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
3661 		    XE_IOCTL_DBG(xe, xe_device_is_l2_flush_optimized(xe) &&
3662 				 (op == DRM_XE_VM_BIND_OP_MAP_USERPTR ||
3663 				  is_cpu_addr_mirror) &&
3664 				 (pat_index != 19 && coh_mode != XE_COH_2WAY)) ||
3665 		    XE_IOCTL_DBG(xe, comp_en &&
3666 				 op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
3667 		    XE_IOCTL_DBG(xe, op == DRM_XE_VM_BIND_OP_MAP_USERPTR &&
3668 				 !IS_ENABLED(CONFIG_DRM_GPUSVM)) ||
3669 		    XE_IOCTL_DBG(xe, obj &&
3670 				 op == DRM_XE_VM_BIND_OP_PREFETCH) ||
3671 		    XE_IOCTL_DBG(xe, prefetch_region &&
3672 				 op != DRM_XE_VM_BIND_OP_PREFETCH) ||
3673 		    XE_IOCTL_DBG(xe, (prefetch_region != DRM_XE_CONSULT_MEM_ADVISE_PREF_LOC &&
3674 				      /* Guard against undefined shift in BIT(prefetch_region) */
3675 				      (prefetch_region >= (sizeof(xe->info.mem_region_mask) * 8) ||
3676 				      !(BIT(prefetch_region) & xe->info.mem_region_mask)))) ||
3677 		    XE_IOCTL_DBG(xe, obj &&
3678 				 op == DRM_XE_VM_BIND_OP_UNMAP) ||
3679 		    XE_IOCTL_DBG(xe, (flags & DRM_XE_VM_BIND_FLAG_MADVISE_AUTORESET) &&
3680 				 (!is_cpu_addr_mirror || op != DRM_XE_VM_BIND_OP_MAP))) {
3681 			err = -EINVAL;
3682 			goto free_bind_ops;
3683 		}
3684 
3685 		if (XE_IOCTL_DBG(xe, obj_offset & ~PAGE_MASK) ||
3686 		    XE_IOCTL_DBG(xe, addr & ~PAGE_MASK) ||
3687 		    XE_IOCTL_DBG(xe, range & ~PAGE_MASK) ||
3688 		    XE_IOCTL_DBG(xe, !range &&
3689 				 op != DRM_XE_VM_BIND_OP_UNMAP_ALL)) {
3690 			err = -EINVAL;
3691 			goto free_bind_ops;
3692 		}
3693 
3694 		if (is_decompress && (XE_IOCTL_DBG(xe, !xe_device_has_flat_ccs(xe)) ||
3695 				      XE_IOCTL_DBG(xe, GRAPHICS_VER(xe) < 20) ||
3696 				      XE_IOCTL_DBG(xe, !IS_DGFX(xe)))) {
3697 			err = -EOPNOTSUPP;
3698 			goto free_bind_ops;
3699 		}
3700 	}
3701 
3702 	return 0;
3703 
3704 free_bind_ops:
3705 	if (args->num_binds > 1)
3706 		kvfree(*bind_ops);
3707 	*bind_ops = NULL;
3708 	return err;
3709 }
3710 
3711 static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
3712 				       struct xe_exec_queue *q,
3713 				       struct xe_sync_entry *syncs,
3714 				       int num_syncs)
3715 {
3716 	struct dma_fence *fence = NULL;
3717 	int i, err = 0;
3718 
3719 	if (num_syncs) {
3720 		fence = xe_sync_in_fence_get(syncs, num_syncs,
3721 					     to_wait_exec_queue(vm, q), vm);
3722 		if (IS_ERR(fence))
3723 			return PTR_ERR(fence);
3724 
3725 		for (i = 0; i < num_syncs; i++)
3726 			xe_sync_entry_signal(&syncs[i], fence);
3727 	}
3728 
3729 	dma_fence_put(fence);
3730 
3731 	return err;
3732 }
3733 
3734 static void xe_vma_ops_init(struct xe_vma_ops *vops, struct xe_vm *vm,
3735 			    struct xe_exec_queue *q,
3736 			    struct xe_sync_entry *syncs, u32 num_syncs)
3737 {
3738 	memset(vops, 0, sizeof(*vops));
3739 	INIT_LIST_HEAD(&vops->list);
3740 	vops->vm = vm;
3741 	vops->q = q;
3742 	vops->syncs = syncs;
3743 	vops->num_syncs = num_syncs;
3744 	vops->flags = 0;
3745 }
3746 
/*
 * xe_vm_bind_ioctl_validate_bo() - Per-BO validation of a bind operation
 * @xe: Xe device.
 * @bo: Buffer object being bound.
 * @addr: GPU virtual address of the bind.
 * @range: Size of the bind.
 * @obj_offset: Offset into @bo.
 * @pat_index: PAT index selected for the mapping.
 * @op: DRM_XE_VM_BIND_OP_* operation.
 * @bind_flags: DRM_XE_VM_BIND_FLAG_* flags.
 *
 * Checks compression restrictions, range bounds, 64k alignment on
 * platforms that need it, CPU-caching vs coherency-mode compatibility,
 * dma-buf import restrictions, and PXP key validity.
 *
 * Return: 0 on success, -EINVAL on an invalid combination, -ENOEXEC if a
 * PXP-protected BO's key is no longer valid.
 */
static int xe_vm_bind_ioctl_validate_bo(struct xe_device *xe, struct xe_bo *bo,
					u64 addr, u64 range, u64 obj_offset,
					u16 pat_index, u32 op, u32 bind_flags)
{
	u16 coh_mode;
	bool comp_en;

	/* A BO created without compression can't take a compressing PAT */
	if (XE_IOCTL_DBG(xe, (bo->flags & XE_BO_FLAG_NO_COMPRESSION) &&
			 xe_pat_index_get_comp_en(xe, pat_index)))
		return -EINVAL;

	/* Bind range and offset must stay within the BO */
	if (XE_IOCTL_DBG(xe, range > xe_bo_size(bo)) ||
	    XE_IOCTL_DBG(xe, obj_offset >
			 xe_bo_size(bo) - range)) {
		return -EINVAL;
	}

	/*
	 * Some platforms require 64k VM_BIND alignment,
	 * specifically those with XE_VRAM_FLAGS_NEED64K.
	 *
	 * Other platforms may have BO's set to 64k physical placement,
	 * but can be mapped at 4k offsets anyway. This check is only
	 * there for the former case.
	 */
	if ((bo->flags & XE_BO_FLAG_INTERNAL_64K) &&
	    (xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)) {
		if (XE_IOCTL_DBG(xe, obj_offset &
				 XE_64K_PAGE_MASK) ||
		    XE_IOCTL_DBG(xe, addr & XE_64K_PAGE_MASK) ||
		    XE_IOCTL_DBG(xe, range & XE_64K_PAGE_MASK)) {
			return -EINVAL;
		}
	}

	coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
	if (bo->cpu_caching) {
		/* WB CPU caching is incompatible with a non-coherent PAT */
		if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE &&
				 bo->cpu_caching == DRM_XE_GEM_CPU_CACHING_WB)) {
			return -EINVAL;
		}
	} else if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE)) {
		/*
		 * Imported dma-buf from a different device should
		 * require 1way or 2way coherency since we don't know
		 * how it was mapped on the CPU. Just assume is it
		 * potentially cached on CPU side.
		 */
		return -EINVAL;
	}

	/*
	 * Ensures that imported buffer objects (dma-bufs) are not mapped
	 * with a PAT index that enables compression.
	 */
	comp_en = xe_pat_index_get_comp_en(xe, pat_index);
	if (XE_IOCTL_DBG(xe, bo->ttm.base.import_attach && comp_en))
		return -EINVAL;

	/*
	 * NOTE(review): PAT index 19 looks like a platform-specific
	 * allowed entry on L2-flush-optimized devices — confirm against
	 * the PAT table for these platforms.
	 */
	if (XE_IOCTL_DBG(xe, bo->ttm.base.import_attach && xe_device_is_l2_flush_optimized(xe) &&
			 (pat_index != 19 && coh_mode != XE_COH_2WAY)))
		return -EINVAL;

	/* If a BO is protected it can only be mapped if the key is still valid */
	if ((bind_flags & DRM_XE_VM_BIND_FLAG_CHECK_PXP) && xe_bo_is_protected(bo) &&
	    op != DRM_XE_VM_BIND_OP_UNMAP && op != DRM_XE_VM_BIND_OP_UNMAP_ALL)
		if (XE_IOCTL_DBG(xe, xe_pxp_bo_key_check(xe->pxp, bo) != 0))
			return -ENOEXEC;

	return 0;
}
3818 
/*
 * xe_vm_bind_ioctl() - Handler for DRM_IOCTL_XE_VM_BIND
 * @dev: DRM device.
 * @data: struct drm_xe_vm_bind ioctl arguments.
 * @file: DRM file the ioctl was issued on.
 *
 * Validates arguments, looks up the VM, optional exec queue and BOs,
 * parses sync entries, builds the GPUVA operation lists for every bind
 * op, then executes them and signals the syncs. All failure paths unwind
 * through the goto ladder at the bottom in reverse acquisition order.
 *
 * Return: 0 on success, negative error code on failure.
 */
int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
{
	struct xe_device *xe = to_xe_device(dev);
	struct xe_file *xef = to_xe_file(file);
	struct drm_xe_vm_bind *args = data;
	struct drm_xe_sync __user *syncs_user;
	struct xe_bo **bos = NULL;
	struct drm_gpuva_ops **ops = NULL;
	struct xe_vm *vm;
	struct xe_exec_queue *q = NULL;
	u32 num_syncs, num_ufence = 0;
	struct xe_sync_entry *syncs = NULL;
	struct drm_xe_vm_bind_op *bind_ops = NULL;
	struct xe_vma_ops vops;
	struct dma_fence *fence;
	int err;
	int i;

	vm = xe_vm_lookup(xef, args->vm_id);
	if (XE_IOCTL_DBG(xe, !vm))
		return -EINVAL;

	err = vm_bind_ioctl_check_args(xe, vm, args, &bind_ops);
	if (err)
		goto put_vm;

	/* A user-supplied queue must be a VM-bind queue owned by this VM */
	if (args->exec_queue_id) {
		q = xe_exec_queue_lookup(xef, args->exec_queue_id);
		if (XE_IOCTL_DBG(xe, !q)) {
			err = -ENOENT;
			goto free_bind_ops;
		}

		if (XE_IOCTL_DBG(xe, !(q->flags & EXEC_QUEUE_FLAG_VM))) {
			err = -EINVAL;
			goto put_exec_queue;
		}
	}

	if (XE_IOCTL_DBG(xe, q && vm != q->user_vm)) {
		err = -EINVAL;
		goto put_exec_queue;
	}

	/* Ensure all UNMAPs visible */
	xe_svm_flush(vm);

	err = down_write_killable(&vm->lock);
	if (err)
		goto put_exec_queue;

	if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) {
		err = -ENOENT;
		goto release_vm_lock;
	}

	/* Every bind must fit inside the VM's address space */
	for (i = 0; i < args->num_binds; ++i) {
		u64 range = bind_ops[i].range;
		u64 addr = bind_ops[i].addr;

		if (XE_IOCTL_DBG(xe, range > vm->size) ||
		    XE_IOCTL_DBG(xe, addr > vm->size - range)) {
			err = -EINVAL;
			goto release_vm_lock;
		}
	}

	/* Per-bind scratch arrays for looked-up BOs and GPUVA op lists */
	if (args->num_binds) {
		bos = kvzalloc_objs(*bos, args->num_binds,
				    GFP_KERNEL | __GFP_ACCOUNT | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
		if (!bos) {
			err = -ENOMEM;
			goto release_vm_lock;
		}

		ops = kvzalloc_objs(*ops, args->num_binds,
				    GFP_KERNEL | __GFP_ACCOUNT | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
		if (!ops) {
			err = -ENOMEM;
			goto free_bos;
		}
	}

	/* Look up and validate every referenced GEM object */
	for (i = 0; i < args->num_binds; ++i) {
		struct drm_gem_object *gem_obj;
		u64 range = bind_ops[i].range;
		u64 addr = bind_ops[i].addr;
		u32 obj = bind_ops[i].obj;
		u64 obj_offset = bind_ops[i].obj_offset;
		u16 pat_index = bind_ops[i].pat_index;
		u32 op = bind_ops[i].op;
		u32 bind_flags = bind_ops[i].flags;

		if (!obj)
			continue;

		gem_obj = drm_gem_object_lookup(file, obj);
		if (XE_IOCTL_DBG(xe, !gem_obj)) {
			err = -ENOENT;
			goto put_obj;
		}
		bos[i] = gem_to_xe_bo(gem_obj);

		err = xe_vm_bind_ioctl_validate_bo(xe, bos[i], addr, range,
						   obj_offset, pat_index, op,
						   bind_flags);
		if (err)
			goto put_obj;
	}

	if (args->num_syncs) {
		syncs = kzalloc_objs(*syncs, args->num_syncs);
		if (!syncs) {
			err = -ENOMEM;
			goto put_obj;
		}
	}

	/* Parse sync entries; at most one user fence is allowed */
	syncs_user = u64_to_user_ptr(args->syncs);
	for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
		struct xe_exec_queue *__q = q ?: vm->q[0];

		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
					  &syncs_user[num_syncs],
					  __q->ufence_syncobj,
					  ++__q->ufence_timeline_value,
					  (xe_vm_in_lr_mode(vm) ?
					   SYNC_PARSE_FLAG_LR_MODE : 0) |
					  (!args->num_binds ?
					   SYNC_PARSE_FLAG_DISALLOW_USER_FENCE : 0));
		if (err)
			goto free_syncs;

		if (xe_sync_is_ufence(&syncs[num_syncs]))
			num_ufence++;
	}

	if (XE_IOCTL_DBG(xe, num_ufence > 1)) {
		err = -EINVAL;
		goto free_syncs;
	}

	/* No binds: -ENODATA triggers sync signaling in the unwind path */
	if (!args->num_binds) {
		err = -ENODATA;
		goto free_syncs;
	}

	/* Build and parse GPUVA operation lists for every bind op */
	xe_vma_ops_init(&vops, vm, q, syncs, num_syncs);
	if (args->num_binds > 1)
		vops.flags |= XE_VMA_OPS_ARRAY_OF_BINDS;
	for (i = 0; i < args->num_binds; ++i) {
		u64 range = bind_ops[i].range;
		u64 addr = bind_ops[i].addr;
		u32 op = bind_ops[i].op;
		u32 flags = bind_ops[i].flags;
		u64 obj_offset = bind_ops[i].obj_offset;
		u32 prefetch_region = bind_ops[i].prefetch_mem_region_instance;
		u16 pat_index = bind_ops[i].pat_index;

		ops[i] = vm_bind_ioctl_ops_create(vm, &vops, bos[i], obj_offset,
						  addr, range, op, flags,
						  prefetch_region, pat_index);
		if (IS_ERR(ops[i])) {
			err = PTR_ERR(ops[i]);
			ops[i] = NULL;
			goto unwind_ops;
		}

		err = vm_bind_ioctl_ops_parse(vm, ops[i], &vops);
		if (err)
			goto unwind_ops;

#ifdef TEST_VM_OPS_ERROR
		if (flags & FORCE_OP_ERROR) {
			vops.inject_error = true;
			vm->xe->vm_inject_error_position =
				(vm->xe->vm_inject_error_position + 1) %
				FORCE_OP_ERROR_COUNT;
		}
#endif
	}

	/* Nothing to do */
	if (list_empty(&vops.list)) {
		err = -ENODATA;
		goto unwind_ops;
	}

	err = xe_vma_ops_alloc(&vops, args->num_binds > 1);
	if (err)
		goto unwind_ops;

	err = vm_bind_ioctl_ops_prefetch_ranges(vm, &vops);
	if (err)
		goto unwind_ops;

	fence = vm_bind_ioctl_ops_execute(vm, &vops);
	if (IS_ERR(fence))
		err = PTR_ERR(fence);
	else
		dma_fence_put(fence);

unwind_ops:
	if (err && err != -ENODATA)
		vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
	xe_vma_ops_fini(&vops);
	for (i = args->num_binds - 1; i >= 0; --i)
		if (ops[i])
			drm_gpuva_ops_free(&vm->gpuvm, ops[i]);
free_syncs:
	/* -ENODATA: no work was executed, but out-syncs must still fire */
	if (err == -ENODATA)
		err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs);
	while (num_syncs--)
		xe_sync_entry_cleanup(&syncs[num_syncs]);

	kfree(syncs);
put_obj:
	for (i = 0; i < args->num_binds; ++i)
		xe_bo_put(bos[i]);

	kvfree(ops);
free_bos:
	kvfree(bos);
release_vm_lock:
	up_write(&vm->lock);
put_exec_queue:
	if (q)
		xe_exec_queue_put(q);
free_bind_ops:
	if (args->num_binds > 1)
		kvfree(bind_ops);
put_vm:
	xe_vm_put(vm);
	return err;
}
4054 
4055 /*
4056  * Map access type, fault type, and fault level from current bspec
4057  * specification to user spec abstraction.  The current mapping is
4058  * approximately 1-to-1, with access type being the only notable
4059  * exception as it carries additional data with respect to prefetch
4060  * status that needs to be masked out.
4061  */
4062 static u8 xe_to_user_access_type(u8 access_type)
4063 {
4064 	return access_type & XE_PAGEFAULT_ACCESS_TYPE_MASK;
4065 }
4066 
/* Fault type currently maps 1-to-1 and is reported unmodified */
static u8 xe_to_user_fault_type(u8 fault_type)
{
	return fault_type;
}
4071 
/* Fault level currently maps 1-to-1 and is reported unmodified */
static u8 xe_to_user_fault_level(u8 fault_level)
{
	return fault_level;
}
4076 
4077 static int fill_faults(struct xe_vm *vm,
4078 		       struct drm_xe_vm_get_property *args)
4079 {
4080 	struct xe_vm_fault __user *usr_ptr = u64_to_user_ptr(args->data);
4081 	struct xe_vm_fault *fault_list, fault_entry = { 0 };
4082 	struct xe_vm_fault_entry *entry;
4083 	int ret = 0, i = 0, count, entry_size;
4084 
4085 	entry_size = sizeof(struct xe_vm_fault);
4086 	count = args->size / entry_size;
4087 
4088 	fault_list = kcalloc(count, sizeof(struct xe_vm_fault), GFP_KERNEL);
4089 	if (!fault_list)
4090 		return -ENOMEM;
4091 
4092 	spin_lock(&vm->faults.lock);
4093 	list_for_each_entry(entry, &vm->faults.list, list) {
4094 		if (i == count)
4095 			break;
4096 
4097 		fault_entry.address = xe_device_canonicalize_addr(vm->xe, entry->address);
4098 		fault_entry.address_precision = entry->address_precision;
4099 
4100 		fault_entry.access_type = xe_to_user_access_type(entry->access_type);
4101 		fault_entry.fault_type = xe_to_user_fault_type(entry->fault_type);
4102 		fault_entry.fault_level = xe_to_user_fault_level(entry->fault_level);
4103 
4104 		memcpy(&fault_list[i], &fault_entry, entry_size);
4105 
4106 		i++;
4107 	}
4108 	spin_unlock(&vm->faults.lock);
4109 
4110 	ret = copy_to_user(usr_ptr, fault_list, args->size);
4111 
4112 	kfree(fault_list);
4113 	return ret ? -EFAULT : 0;
4114 }
4115 
/*
 * xe_vm_get_property_helper() - Dispatch a VM property query
 * @vm: The VM being queried.
 * @args: ioctl arguments; @args->property selects the query.
 *
 * For DRM_XE_VM_GET_PROPERTY_FAULTS: a zero @args->size is a size query
 * and returns the current fault-array size in bytes; otherwise the
 * requested amount of fault data is copied out.
 *
 * Return: 0 on success, -EINVAL on an unknown property or invalid size,
 * error from fill_faults() otherwise.
 */
static int xe_vm_get_property_helper(struct xe_vm *vm,
				     struct drm_xe_vm_get_property *args)
{
	size_t size;

	switch (args->property) {
	case DRM_XE_VM_GET_PROPERTY_FAULTS:
		/* Snapshot the current fault-array size under the lock */
		spin_lock(&vm->faults.lock);
		size = size_mul(sizeof(struct xe_vm_fault), vm->faults.len);
		spin_unlock(&vm->faults.lock);

		/* Zero size means the caller is asking how much to allocate */
		if (!args->size) {
			args->size = size;
			return 0;
		}

		/*
		 * Number of faults may increase between calls to
		 * xe_vm_get_property_ioctl, so just report the number of
		 * faults the user requests if it's less than or equal to
		 * the number of faults in the VM fault array.
		 *
		 * We should also at least assert that the args->size value
		 * is a multiple of the xe_vm_fault struct size.
		 */
		if (args->size > size || args->size % sizeof(struct xe_vm_fault))
			return -EINVAL;

		return fill_faults(vm, args);
	}
	return -EINVAL;
}
4148 
4149 int xe_vm_get_property_ioctl(struct drm_device *drm, void *data,
4150 			     struct drm_file *file)
4151 {
4152 	struct xe_device *xe = to_xe_device(drm);
4153 	struct xe_file *xef = to_xe_file(file);
4154 	struct drm_xe_vm_get_property *args = data;
4155 	struct xe_vm *vm;
4156 	int ret = 0;
4157 
4158 	if (XE_IOCTL_DBG(xe, (args->reserved[0] || args->reserved[1] ||
4159 			      args->reserved[2])))
4160 		return -EINVAL;
4161 
4162 	vm = xe_vm_lookup(xef, args->vm_id);
4163 	if (XE_IOCTL_DBG(xe, !vm))
4164 		return -ENOENT;
4165 
4166 	ret = xe_vm_get_property_helper(vm, args);
4167 
4168 	xe_vm_put(vm);
4169 	return ret;
4170 }
4171 
4172 /**
4173  * xe_vm_bind_kernel_bo - bind a kernel BO to a VM
4174  * @vm: VM to bind the BO to
4175  * @bo: BO to bind
4176  * @q: exec queue to use for the bind (optional)
4177  * @addr: address at which to bind the BO
4178  * @cache_lvl: PAT cache level to use
4179  *
4180  * Execute a VM bind map operation on a kernel-owned BO to bind it into a
4181  * kernel-owned VM.
4182  *
4183  * Returns a dma_fence to track the binding completion if the job to do so was
4184  * successfully submitted, an error pointer otherwise.
4185  */
4186 struct dma_fence *xe_vm_bind_kernel_bo(struct xe_vm *vm, struct xe_bo *bo,
4187 				       struct xe_exec_queue *q, u64 addr,
4188 				       enum xe_cache_level cache_lvl)
4189 {
4190 	struct xe_vma_ops vops;
4191 	struct drm_gpuva_ops *ops = NULL;
4192 	struct dma_fence *fence;
4193 	int err;
4194 
4195 	xe_bo_get(bo);
4196 	xe_vm_get(vm);
4197 	if (q)
4198 		xe_exec_queue_get(q);
4199 
4200 	down_write(&vm->lock);
4201 
4202 	xe_vma_ops_init(&vops, vm, q, NULL, 0);
4203 
4204 	ops = vm_bind_ioctl_ops_create(vm, &vops, bo, 0, addr, xe_bo_size(bo),
4205 				       DRM_XE_VM_BIND_OP_MAP, 0, 0,
4206 				       vm->xe->pat.idx[cache_lvl]);
4207 	if (IS_ERR(ops)) {
4208 		err = PTR_ERR(ops);
4209 		goto release_vm_lock;
4210 	}
4211 
4212 	err = vm_bind_ioctl_ops_parse(vm, ops, &vops);
4213 	if (err)
4214 		goto release_vm_lock;
4215 
4216 	xe_assert(vm->xe, !list_empty(&vops.list));
4217 
4218 	err = xe_vma_ops_alloc(&vops, false);
4219 	if (err)
4220 		goto unwind_ops;
4221 
4222 	fence = vm_bind_ioctl_ops_execute(vm, &vops);
4223 	if (IS_ERR(fence))
4224 		err = PTR_ERR(fence);
4225 
4226 unwind_ops:
4227 	if (err && err != -ENODATA)
4228 		vm_bind_ioctl_ops_unwind(vm, &ops, 1);
4229 
4230 	xe_vma_ops_fini(&vops);
4231 	drm_gpuva_ops_free(&vm->gpuvm, ops);
4232 
4233 release_vm_lock:
4234 	up_write(&vm->lock);
4235 
4236 	if (q)
4237 		xe_exec_queue_put(q);
4238 	xe_vm_put(vm);
4239 	xe_bo_put(bo);
4240 
4241 	if (err)
4242 		fence = ERR_PTR(err);
4243 
4244 	return fence;
4245 }
4246 
4247 /**
4248  * xe_vm_lock() - Lock the vm's dma_resv object
4249  * @vm: The struct xe_vm whose lock is to be locked
4250  * @intr: Whether to perform any wait interruptible
4251  *
4252  * Return: 0 on success, -EINTR if @intr is true and the wait for a
4253  * contended lock was interrupted. If @intr is false, the function
4254  * always returns 0.
4255  */
4256 int xe_vm_lock(struct xe_vm *vm, bool intr)
4257 {
4258 	int ret;
4259 
4260 	if (intr)
4261 		ret = dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
4262 	else
4263 		ret = dma_resv_lock(xe_vm_resv(vm), NULL);
4264 
4265 	return ret;
4266 }
4267 
4268 /**
4269  * xe_vm_unlock() - Unlock the vm's dma_resv object
4270  * @vm: The struct xe_vm whose lock is to be released.
4271  *
4272  * Unlock a buffer object lock that was locked by xe_vm_lock().
4273  */
4274 void xe_vm_unlock(struct xe_vm *vm)
4275 {
4276 	dma_resv_unlock(xe_vm_resv(vm));
4277 }
4278 
4279 /**
4280  * xe_vm_invalidate_vma_submit - Submit a job to invalidate GPU mappings for
4281  * VMA.
4282  * @vma: VMA to invalidate
4283  * @batch: TLB invalidation batch to populate; caller must later call
4284  *         xe_tlb_inval_batch_wait() on it to wait for completion
4285  *
4286  * Walks a list of page tables leaves which it memset the entries owned by this
4287  * VMA to zero, invalidates the TLBs, but doesn't block waiting for TLB flush
4288  * to complete, but instead populates @batch which can be waited on using
4289  * xe_tlb_inval_batch_wait().
4290  *
4291  * Returns 0 for success, negative error code otherwise.
4292  */
4293 int xe_vm_invalidate_vma_submit(struct xe_vma *vma, struct xe_tlb_inval_batch *batch)
4294 {
4295 	struct xe_device *xe = xe_vma_vm(vma)->xe;
4296 	struct xe_vm *vm = xe_vma_vm(vma);
4297 	struct xe_tile *tile;
4298 	u8 tile_mask = 0;
4299 	int ret = 0;
4300 	u8 id;
4301 
4302 	xe_assert(xe, !xe_vma_is_null(vma));
4303 	xe_assert(xe, !xe_vma_is_cpu_addr_mirror(vma));
4304 	trace_xe_vma_invalidate(vma);
4305 
4306 	vm_dbg(&vm->xe->drm,
4307 	       "INVALIDATE: addr=0x%016llx, range=0x%016llx",
4308 		xe_vma_start(vma), xe_vma_size(vma));
4309 
4310 	/*
4311 	 * Check that we don't race with page-table updates, tile_invalidated
4312 	 * update is safe
4313 	 */
4314 	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
4315 		if (xe_vma_is_userptr(vma)) {
4316 			lockdep_assert(lockdep_is_held_type(&vm->svm.gpusvm.notifier_lock, 0) ||
4317 				       (lockdep_is_held_type(&vm->svm.gpusvm.notifier_lock, 1) &&
4318 					lockdep_is_held(&xe_vm_resv(vm)->lock.base)));
4319 
4320 			WARN_ON_ONCE(!mmu_interval_check_retry
4321 				     (&to_userptr_vma(vma)->userptr.notifier,
4322 				      to_userptr_vma(vma)->userptr.pages.notifier_seq));
4323 			WARN_ON_ONCE(!dma_resv_test_signaled(xe_vm_resv(vm),
4324 							     DMA_RESV_USAGE_BOOKKEEP));
4325 
4326 		} else {
4327 			xe_bo_assert_held(xe_vma_bo(vma));
4328 		}
4329 	}
4330 
4331 	for_each_tile(tile, xe, id)
4332 		if (xe_pt_zap_ptes(tile, vma))
4333 			tile_mask |= BIT(id);
4334 
4335 	xe_device_wmb(xe);
4336 
4337 	ret = xe_tlb_inval_range_tilemask_submit(xe, xe_vma_vm(vma)->usm.asid,
4338 						 xe_vma_start(vma), xe_vma_end(vma),
4339 						 tile_mask, batch);
4340 
4341 	/* WRITE_ONCE pairs with READ_ONCE in xe_vm_has_valid_gpu_mapping() */
4342 	WRITE_ONCE(vma->tile_invalidated, vma->tile_mask);
4343 	return ret;
4344 }
4345 
4346 /**
4347  * xe_vm_invalidate_vma - invalidate GPU mappings for VMA without a lock
4348  * @vma: VMA to invalidate
4349  *
4350  * Walks a list of page tables leaves which it memset the entries owned by this
4351  * VMA to zero, invalidates the TLBs, and block until TLBs invalidation is
4352  * complete.
4353  *
4354  * Returns 0 for success, negative error code otherwise.
4355  */
4356 int xe_vm_invalidate_vma(struct xe_vma *vma)
4357 {
4358 	struct xe_tlb_inval_batch batch;
4359 	int ret;
4360 
4361 	ret = xe_vm_invalidate_vma_submit(vma, &batch);
4362 	if (ret)
4363 		return ret;
4364 
4365 	xe_tlb_inval_batch_wait(&batch);
4366 	return ret;
4367 }
4368 
4369 int xe_vm_validate_protected(struct xe_vm *vm)
4370 {
4371 	struct drm_gpuva *gpuva;
4372 	int err = 0;
4373 
4374 	if (!vm)
4375 		return -ENODEV;
4376 
4377 	mutex_lock(&vm->snap_mutex);
4378 
4379 	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
4380 		struct xe_vma *vma = gpuva_to_vma(gpuva);
4381 		struct xe_bo *bo = vma->gpuva.gem.obj ?
4382 			gem_to_xe_bo(vma->gpuva.gem.obj) : NULL;
4383 
4384 		if (!bo)
4385 			continue;
4386 
4387 		if (xe_bo_is_protected(bo)) {
4388 			err = xe_pxp_bo_key_check(vm->xe->pxp, bo);
4389 			if (err)
4390 				break;
4391 		}
4392 	}
4393 
4394 	mutex_unlock(&vm->snap_mutex);
4395 	return err;
4396 }
4397 
/* Snapshot of a VM's dumpable mappings, captured for devcoredump output. */
struct xe_vm_snapshot {
	int uapi_flags;		/* DRM_XE_VM_CREATE_FLAG_* the VM was created with */
	unsigned long num_snaps;	/* number of entries in snap[] */
	struct {
		u64 ofs, bo_ofs;	/* GPU VA start; bo offset or userptr address */
		unsigned long len;	/* size of the mapping in bytes */
#define XE_VM_SNAP_FLAG_USERPTR		BIT(0)
#define XE_VM_SNAP_FLAG_READ_ONLY	BIT(1)
#define XE_VM_SNAP_FLAG_IS_NULL		BIT(2)
		unsigned long flags;
		int uapi_mem_region;	/* uapi memory region index, -1 if none */
		int pat_index;
		int cpu_caching;
		struct xe_bo *bo;	/* bo ref held until the delayed content copy */
		void *data;	/* captured contents, or ERR_PTR() on failure */
		struct mm_struct *mm;	/* mm ref for userptr captures */
	} snap[];
};
4416 
/*
 * Capture metadata for all XE_VMA_DUMPABLE mappings of @vm: GPU VA range,
 * attributes and backing (bo reference, userptr mm reference, or null/sparse
 * marker). The mapping contents are not copied here; that happens later in
 * xe_vm_snapshot_capture_delayed().
 *
 * Returns the snapshot, NULL if @vm is NULL, or an ERR_PTR() on allocation
 * failure / no dumpable VMAs.
 */
struct xe_vm_snapshot *xe_vm_snapshot_capture(struct xe_vm *vm)
{
	unsigned long num_snaps = 0, i;
	struct xe_vm_snapshot *snap = NULL;
	struct drm_gpuva *gpuva;

	if (!vm)
		return NULL;

	mutex_lock(&vm->snap_mutex);
	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
		if (gpuva->flags & XE_VMA_DUMPABLE)
			num_snaps++;
	}

	/*
	 * NOTE(review): GFP_NOWAIT presumably because capture runs in a
	 * context that must not block on reclaim (coredump path) -- confirm
	 * against callers.
	 */
	if (num_snaps)
		snap = kvzalloc(offsetof(struct xe_vm_snapshot, snap[num_snaps]), GFP_NOWAIT);
	if (!snap) {
		snap = num_snaps ? ERR_PTR(-ENOMEM) : ERR_PTR(-ENODEV);
		goto out_unlock;
	}

	/* Translate internal VM flags back to their uapi creation flags */
	if (vm->flags & XE_VM_FLAG_FAULT_MODE)
		snap->uapi_flags |= DRM_XE_VM_CREATE_FLAG_FAULT_MODE;
	if (vm->flags & XE_VM_FLAG_LR_MODE)
		snap->uapi_flags |= DRM_XE_VM_CREATE_FLAG_LR_MODE;
	if (vm->flags & XE_VM_FLAG_SCRATCH_PAGE)
		snap->uapi_flags |= DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE;

	snap->num_snaps = num_snaps;
	i = 0;
	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
		struct xe_vma *vma = gpuva_to_vma(gpuva);
		struct xe_bo *bo = vma->gpuva.gem.obj ?
			gem_to_xe_bo(vma->gpuva.gem.obj) : NULL;

		if (!(gpuva->flags & XE_VMA_DUMPABLE))
			continue;

		snap->snap[i].ofs = xe_vma_start(vma);
		snap->snap[i].len = xe_vma_size(vma);
		snap->snap[i].flags = xe_vma_read_only(vma) ?
			XE_VM_SNAP_FLAG_READ_ONLY : 0;
		snap->snap[i].pat_index = vma->attr.pat_index;
		if (bo) {
			/* Hold a bo reference until the delayed content copy */
			snap->snap[i].cpu_caching = bo->cpu_caching;
			snap->snap[i].bo = xe_bo_get(bo);
			snap->snap[i].bo_ofs = xe_vma_bo_offset(vma);
			switch (bo->ttm.resource->mem_type) {
			case XE_PL_SYSTEM:
			case XE_PL_TT:
				snap->snap[i].uapi_mem_region = 0;
				break;
			case XE_PL_VRAM0:
				snap->snap[i].uapi_mem_region = 1;
				break;
			case XE_PL_VRAM1:
				snap->snap[i].uapi_mem_region = 2;
				break;
			}
		} else if (xe_vma_is_userptr(vma)) {
			struct mm_struct *mm =
				to_userptr_vma(vma)->userptr.notifier.mm;

			/* Pin the mm so the delayed copy can still read it */
			if (mmget_not_zero(mm))
				snap->snap[i].mm = mm;
			else
				snap->snap[i].data = ERR_PTR(-EFAULT);

			snap->snap[i].bo_ofs = xe_vma_userptr(vma);
			snap->snap[i].flags |= XE_VM_SNAP_FLAG_USERPTR;
			snap->snap[i].uapi_mem_region = 0;
		} else if (xe_vma_is_null(vma)) {
			snap->snap[i].flags |= XE_VM_SNAP_FLAG_IS_NULL;
			snap->snap[i].uapi_mem_region = -1;
		} else {
			/* No backing we know how to dump */
			snap->snap[i].data = ERR_PTR(-ENOENT);
			snap->snap[i].uapi_mem_region = -1;
		}
		i++;
	}

out_unlock:
	mutex_unlock(&vm->snap_mutex);
	return snap;
}
4503 
4504 void xe_vm_snapshot_capture_delayed(struct xe_vm_snapshot *snap)
4505 {
4506 	if (IS_ERR_OR_NULL(snap))
4507 		return;
4508 
4509 	for (int i = 0; i < snap->num_snaps; i++) {
4510 		struct xe_bo *bo = snap->snap[i].bo;
4511 		int err;
4512 
4513 		if (IS_ERR(snap->snap[i].data) ||
4514 		    snap->snap[i].flags & XE_VM_SNAP_FLAG_IS_NULL)
4515 			continue;
4516 
4517 		snap->snap[i].data = kvmalloc(snap->snap[i].len, GFP_USER);
4518 		if (!snap->snap[i].data) {
4519 			snap->snap[i].data = ERR_PTR(-ENOMEM);
4520 			goto cleanup_bo;
4521 		}
4522 
4523 		if (bo) {
4524 			err = xe_bo_read(bo, snap->snap[i].bo_ofs,
4525 					 snap->snap[i].data, snap->snap[i].len);
4526 		} else {
4527 			void __user *userptr = (void __user *)(size_t)snap->snap[i].bo_ofs;
4528 
4529 			kthread_use_mm(snap->snap[i].mm);
4530 			if (!copy_from_user(snap->snap[i].data, userptr, snap->snap[i].len))
4531 				err = 0;
4532 			else
4533 				err = -EFAULT;
4534 			kthread_unuse_mm(snap->snap[i].mm);
4535 
4536 			mmput(snap->snap[i].mm);
4537 			snap->snap[i].mm = NULL;
4538 		}
4539 
4540 		if (err) {
4541 			kvfree(snap->snap[i].data);
4542 			snap->snap[i].data = ERR_PTR(err);
4543 		}
4544 
4545 cleanup_bo:
4546 		xe_bo_put(bo);
4547 		snap->snap[i].bo = NULL;
4548 	}
4549 }
4550 
/*
 * Print a VM snapshot to @p in the devcoredump text format: per-mapping
 * length, properties and the ascii85-encoded contents. Per-entry errors (or
 * a snapshot-level error) are printed in place of the data.
 */
void xe_vm_snapshot_print(struct xe_vm_snapshot *snap, struct drm_printer *p)
{
	unsigned long i, j;

	if (IS_ERR_OR_NULL(snap)) {
		/* PTR_ERR(NULL) yields 0, so a NULL snapshot prints error: 0 */
		drm_printf(p, "[0].error: %li\n", PTR_ERR(snap));
		return;
	}

	drm_printf(p, "VM.uapi_flags: 0x%x\n", snap->uapi_flags);
	for (i = 0; i < snap->num_snaps; i++) {
		drm_printf(p, "[%llx].length: 0x%lx\n", snap->snap[i].ofs, snap->snap[i].len);

		drm_printf(p, "[%llx].properties: %s|%s|mem_region=0x%lx|pat_index=%d|cpu_caching=%d\n",
			   snap->snap[i].ofs,
			   snap->snap[i].flags & XE_VM_SNAP_FLAG_READ_ONLY ?
			   "read_only" : "read_write",
			   snap->snap[i].flags & XE_VM_SNAP_FLAG_IS_NULL ?
			   "null_sparse" :
			   snap->snap[i].flags & XE_VM_SNAP_FLAG_USERPTR ?
			   "userptr" : "bo",
			   snap->snap[i].uapi_mem_region == -1 ? 0 :
			   BIT(snap->snap[i].uapi_mem_region),
			   snap->snap[i].pat_index,
			   snap->snap[i].cpu_caching);

		if (IS_ERR(snap->snap[i].data)) {
			drm_printf(p, "[%llx].error: %li\n", snap->snap[i].ofs,
				   PTR_ERR(snap->snap[i].data));
			continue;
		}

		/* Null/sparse mappings have no contents to dump */
		if (snap->snap[i].flags & XE_VM_SNAP_FLAG_IS_NULL)
			continue;

		drm_printf(p, "[%llx].data: ", snap->snap[i].ofs);

		/* ascii85-encode the contents one 32-bit word at a time */
		for (j = 0; j < snap->snap[i].len; j += sizeof(u32)) {
			u32 *val = snap->snap[i].data + j;
			char dumped[ASCII85_BUFSZ];

			drm_puts(p, ascii85_encode(*val, dumped));
		}

		drm_puts(p, "\n");

		/* Stop early once the coredump buffer is exhausted */
		if (drm_coredump_printer_is_full(p))
			return;
	}
}
4601 
4602 void xe_vm_snapshot_free(struct xe_vm_snapshot *snap)
4603 {
4604 	unsigned long i;
4605 
4606 	if (IS_ERR_OR_NULL(snap))
4607 		return;
4608 
4609 	for (i = 0; i < snap->num_snaps; i++) {
4610 		if (!IS_ERR(snap->snap[i].data))
4611 			kvfree(snap->snap[i].data);
4612 		xe_bo_put(snap->snap[i].bo);
4613 		if (snap->snap[i].mm)
4614 			mmput(snap->snap[i].mm);
4615 	}
4616 	kvfree(snap);
4617 }
4618 
4619 /**
4620  * xe_vma_need_vram_for_atomic - Check if VMA needs VRAM migration for atomic operations
4621  * @xe: Pointer to the Xe device structure
4622  * @vma: Pointer to the virtual memory area (VMA) structure
4623  * @is_atomic: In pagefault path and atomic operation
4624  *
4625  * This function determines whether the given VMA needs to be migrated to
4626  * VRAM in order to do atomic GPU operation.
4627  *
4628  * Return:
4629  *   1        - Migration to VRAM is required
4630  *   0        - Migration is not required
4631  *   -EACCES  - Invalid access for atomic memory attr
4632  *
4633  */
4634 int xe_vma_need_vram_for_atomic(struct xe_device *xe, struct xe_vma *vma, bool is_atomic)
4635 {
4636 	u32 atomic_access = xe_vma_bo(vma) ? xe_vma_bo(vma)->attr.atomic_access :
4637 					     vma->attr.atomic_access;
4638 
4639 	if (!IS_DGFX(xe) || !is_atomic)
4640 		return false;
4641 
4642 	/*
4643 	 * NOTE: The checks implemented here are platform-specific. For
4644 	 * instance, on a device supporting CXL atomics, these would ideally
4645 	 * work universally without additional handling.
4646 	 */
4647 	switch (atomic_access) {
4648 	case DRM_XE_ATOMIC_DEVICE:
4649 		return !xe->info.has_device_atomics_on_smem;
4650 
4651 	case DRM_XE_ATOMIC_CPU:
4652 		return -EACCES;
4653 
4654 	case DRM_XE_ATOMIC_UNDEFINED:
4655 	case DRM_XE_ATOMIC_GLOBAL:
4656 	default:
4657 		return 1;
4658 	}
4659 }
4660 
/*
 * Split (and, for the non-madvise path, merge) VMAs so @map_req's range is
 * covered by dedicated VMAs. Runs in two passes over the GPUVA ops: the first
 * propagates creation flags (and for non-madvise, the default PAT index) from
 * the vmas being replaced onto the new MAP ops; the second, under the vm's
 * resv lock, destroys the replaced vmas and (for madvise) carries their memory
 * attributes over to the newly created vma.
 *
 * Called with vm->lock held in write mode. Returns 0 on success, negative
 * error code otherwise.
 */
static int xe_vm_alloc_vma(struct xe_vm *vm,
			   struct drm_gpuvm_map_req *map_req,
			   bool is_madvise)
{
	struct xe_vma_ops vops;
	struct drm_gpuva_ops *ops = NULL;
	struct drm_gpuva_op *__op;
	unsigned int vma_flags = 0;
	bool remap_op = false;
	struct xe_vma_mem_attr tmp_attr = {};
	u16 default_pat;
	int err;

	lockdep_assert_held_write(&vm->lock);

	if (is_madvise)
		ops = drm_gpuvm_madvise_ops_create(&vm->gpuvm, map_req);
	else
		ops = drm_gpuvm_sm_map_ops_create(&vm->gpuvm, map_req);

	if (IS_ERR(ops))
		return PTR_ERR(ops);

	/* No ops means existing VMAs already match the requested range */
	if (list_empty(&ops->list)) {
		err = 0;
		goto free_ops;
	}

	/* First pass: propagate flags/attributes from replaced vmas to new maps */
	drm_gpuva_for_each_op(__op, ops) {
		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
		struct xe_vma *vma = NULL;

		if (!is_madvise) {
			if (__op->op == DRM_GPUVA_OP_UNMAP) {
				vma = gpuva_to_vma(op->base.unmap.va);
				XE_WARN_ON(!xe_vma_has_default_mem_attrs(vma));
				default_pat = vma->attr.default_pat_index;
				vma_flags = vma->gpuva.flags;
			}

			if (__op->op == DRM_GPUVA_OP_REMAP) {
				vma = gpuva_to_vma(op->base.remap.unmap->va);
				default_pat = vma->attr.default_pat_index;
				vma_flags = vma->gpuva.flags;
			}

			if (__op->op == DRM_GPUVA_OP_MAP) {
				op->map.vma_flags |= vma_flags & XE_VMA_CREATE_MASK;
				op->map.pat_index = default_pat;
			}
		} else {
			if (__op->op == DRM_GPUVA_OP_REMAP) {
				vma = gpuva_to_vma(op->base.remap.unmap->va);
				xe_assert(vm->xe, !remap_op);
				xe_assert(vm->xe, xe_vma_has_no_bo(vma));
				remap_op = true;
				vma_flags = vma->gpuva.flags;
			}

			if (__op->op == DRM_GPUVA_OP_MAP) {
				xe_assert(vm->xe, remap_op);
				remap_op = false;
				/*
				 * In case of madvise ops DRM_GPUVA_OP_MAP is
				 * always after DRM_GPUVA_OP_REMAP, so ensure
				 * to propagate the flags from the vma we're
				 * unmapping.
				 */
				op->map.vma_flags |= vma_flags & XE_VMA_CREATE_MASK;
			}
		}
		print_op(vm->xe, __op);
	}

	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);

	if (is_madvise)
		vops.flags |= XE_VMA_OPS_FLAG_MADVISE;
	else
		vops.flags |= XE_VMA_OPS_FLAG_ALLOW_SVM_UNMAP;

	err = vm_bind_ioctl_ops_parse(vm, ops, &vops);
	if (err)
		goto unwind_ops;

	xe_vm_lock(vm, false);

	/* Second pass: destroy replaced vmas, fix up attributes on new ones */
	drm_gpuva_for_each_op(__op, ops) {
		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
		struct xe_vma *vma;

		if (__op->op == DRM_GPUVA_OP_UNMAP) {
			vma = gpuva_to_vma(op->base.unmap.va);
			/* There should be no unmap for madvise */
			if (is_madvise)
				XE_WARN_ON("UNEXPECTED UNMAP");

			xe_vma_destroy(vma, NULL);
		} else if (__op->op == DRM_GPUVA_OP_REMAP) {
			vma = gpuva_to_vma(op->base.remap.unmap->va);
			/* For madvise ops, store the attributes of the vma being
			 * unmapped by REMAP so they can be assigned to the newly
			 * created MAP vma.
			 */
			if (is_madvise)
				xe_vma_mem_attr_copy(&tmp_attr, &vma->attr);

			xe_vma_destroy(gpuva_to_vma(op->base.remap.unmap->va), NULL);
		} else if (__op->op == DRM_GPUVA_OP_MAP) {
			vma = op->map.vma;
			/* In case of madvise call, MAP will always be preceded
			 * by REMAP. Therefore tmp_attr will always have sane
			 * values, making it safe to copy them to the new vma.
			 */
			if (is_madvise)
				xe_vma_mem_attr_copy(&vma->attr, &tmp_attr);
		}
	}

	xe_vm_unlock(vm);
	drm_gpuva_ops_free(&vm->gpuvm, ops);
	xe_vma_mem_attr_fini(&tmp_attr);
	return 0;

unwind_ops:
	vm_bind_ioctl_ops_unwind(vm, &ops, 1);
free_ops:
	drm_gpuva_ops_free(&vm->gpuvm, ops);
	return err;
}
4790 
4791 /**
4792  * xe_vm_alloc_madvise_vma - Allocate VMA's with madvise ops
4793  * @vm: Pointer to the xe_vm structure
4794  * @start: Starting input address
4795  * @range: Size of the input range
4796  *
4797  * This function splits existing vma to create new vma for user provided input range
4798  *
4799  * Return: 0 if success
4800  */
4801 int xe_vm_alloc_madvise_vma(struct xe_vm *vm, uint64_t start, uint64_t range)
4802 {
4803 	struct drm_gpuvm_map_req map_req = {
4804 		.map.va.addr = start,
4805 		.map.va.range = range,
4806 	};
4807 
4808 	lockdep_assert_held_write(&vm->lock);
4809 
4810 	vm_dbg(&vm->xe->drm, "MADVISE_OPS_CREATE: addr=0x%016llx, size=0x%016llx", start, range);
4811 
4812 	return xe_vm_alloc_vma(vm, &map_req, true);
4813 }
4814 
4815 static bool is_cpu_addr_vma_with_default_attr(struct xe_vma *vma)
4816 {
4817 	return vma && xe_vma_is_cpu_addr_mirror(vma) &&
4818 	       xe_vma_has_default_mem_attrs(vma);
4819 }
4820 
4821 /**
4822  * xe_vm_find_cpu_addr_mirror_vma_range - Extend a VMA range to include adjacent CPU-mirrored VMAs
4823  * @vm: VM to search within
4824  * @start: Input/output pointer to the starting address of the range
4825  * @end: Input/output pointer to the end address of the range
4826  *
4827  * Given a range defined by @start and @range, this function checks the VMAs
4828  * immediately before and after the range. If those neighboring VMAs are
4829  * CPU-address-mirrored and have default memory attributes, the function
4830  * updates @start and @range to include them. This extended range can then
4831  * be used for merging or other operations that require a unified VMA.
4832  *
4833  * The function does not perform the merge itself; it only computes the
4834  * mergeable boundaries.
4835  */
4836 void xe_vm_find_cpu_addr_mirror_vma_range(struct xe_vm *vm, u64 *start, u64 *end)
4837 {
4838 	struct xe_vma *prev, *next;
4839 
4840 	lockdep_assert_held(&vm->lock);
4841 
4842 	if (*start >= SZ_4K) {
4843 		prev = xe_vm_find_vma_by_addr(vm, *start - SZ_4K);
4844 		if (is_cpu_addr_vma_with_default_attr(prev))
4845 			*start = xe_vma_start(prev);
4846 	}
4847 
4848 	if (*end < vm->size) {
4849 		next = xe_vm_find_vma_by_addr(vm, *end + 1);
4850 		if (is_cpu_addr_vma_with_default_attr(next))
4851 			*end = xe_vma_end(next);
4852 	}
4853 }
4854 
4855 /**
4856  * xe_vm_alloc_cpu_addr_mirror_vma - Allocate CPU addr mirror vma
4857  * @vm: Pointer to the xe_vm structure
4858  * @start: Starting input address
4859  * @range: Size of the input range
4860  *
4861  * This function splits/merges existing vma to create new vma for user provided input range
4862  *
4863  * Return: 0 if success
4864  */
4865 int xe_vm_alloc_cpu_addr_mirror_vma(struct xe_vm *vm, uint64_t start, uint64_t range)
4866 {
4867 	struct drm_gpuvm_map_req map_req = {
4868 		.map.va.addr = start,
4869 		.map.va.range = range,
4870 	};
4871 
4872 	lockdep_assert_held_write(&vm->lock);
4873 
4874 	vm_dbg(&vm->xe->drm, "CPU_ADDR_MIRROR_VMA_OPS_CREATE: addr=0x%016llx, size=0x%016llx",
4875 	       start, range);
4876 
4877 	return xe_vm_alloc_vma(vm, &map_req, false);
4878 }
4879 
4880 /**
4881  * xe_vm_add_exec_queue() - Add exec queue to VM
4882  * @vm: The VM.
4883  * @q: The exec_queue
4884  *
4885  * Add exec queue to VM, skipped if the device does not have context based TLB
4886  * invalidations.
4887  */
4888 void xe_vm_add_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
4889 {
4890 	struct xe_device *xe = vm->xe;
4891 
4892 	/* User VMs and queues only */
4893 	xe_assert(xe, !(q->flags & EXEC_QUEUE_FLAG_KERNEL));
4894 	xe_assert(xe, !(q->flags & EXEC_QUEUE_FLAG_PERMANENT));
4895 	xe_assert(xe, !(q->flags & EXEC_QUEUE_FLAG_VM));
4896 	xe_assert(xe, !(q->flags & EXEC_QUEUE_FLAG_MIGRATE));
4897 	xe_assert(xe, vm->xef);
4898 	xe_assert(xe, vm == q->vm);
4899 
4900 	if (!xe->info.has_ctx_tlb_inval)
4901 		return;
4902 
4903 	down_write(&vm->exec_queues.lock);
4904 	list_add(&q->vm_exec_queue_link, &vm->exec_queues.list[q->gt->info.id]);
4905 	++vm->exec_queues.count[q->gt->info.id];
4906 	up_write(&vm->exec_queues.lock);
4907 }
4908 
4909 /**
4910  * xe_vm_remove_exec_queue() - Remove exec queue from VM
4911  * @vm: The VM.
4912  * @q: The exec_queue
4913  *
4914  * Remove exec queue from VM, skipped if the device does not have context based
4915  * TLB invalidations.
4916  */
4917 void xe_vm_remove_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
4918 {
4919 	if (!vm->xe->info.has_ctx_tlb_inval)
4920 		return;
4921 
4922 	down_write(&vm->exec_queues.lock);
4923 	if (!list_empty(&q->vm_exec_queue_link)) {
4924 		list_del(&q->vm_exec_queue_link);
4925 		--vm->exec_queues.count[q->gt->info.id];
4926 	}
4927 	up_write(&vm->exec_queues.lock);
4928 }
4929