xref: /linux/drivers/gpu/drm/xe/xe_vm.c (revision a544da908a70240c3f379b374c35789e04710d42)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5 
6 #include "xe_vm.h"
7 
8 #include <linux/dma-fence-array.h>
9 #include <linux/nospec.h>
10 
11 #include <drm/drm_drv.h>
12 #include <drm/drm_exec.h>
13 #include <drm/drm_print.h>
14 #include <drm/ttm/ttm_tt.h>
15 #include <uapi/drm/xe_drm.h>
16 #include <linux/ascii85.h>
17 #include <linux/delay.h>
18 #include <linux/kthread.h>
19 #include <linux/mm.h>
20 #include <linux/swap.h>
21 
22 #include <generated/xe_wa_oob.h>
23 
24 #include "regs/xe_gtt_defs.h"
25 #include "xe_assert.h"
26 #include "xe_bo.h"
27 #include "xe_device.h"
28 #include "xe_drm_client.h"
29 #include "xe_exec_queue.h"
30 #include "xe_gt.h"
31 #include "xe_migrate.h"
32 #include "xe_pat.h"
33 #include "xe_pm.h"
34 #include "xe_preempt_fence.h"
35 #include "xe_pt.h"
36 #include "xe_pxp.h"
37 #include "xe_sriov_vf.h"
38 #include "xe_svm.h"
39 #include "xe_sync.h"
40 #include "xe_tile.h"
41 #include "xe_tlb_inval.h"
42 #include "xe_trace_bo.h"
43 #include "xe_vm_madvise.h"
44 #include "xe_wa.h"
45 
46 static struct drm_gem_object *xe_vm_obj(struct xe_vm *vm)
47 {
48 	return vm->gpuvm.r_obj;
49 }
50 
/**
 * xe_vm_drm_exec_lock() - Lock the vm's resv with a drm_exec transaction
 * @vm: The vm whose resv is to be locked.
 * @exec: The drm_exec transaction.
 *
 * Thin wrapper that passes the vm's r_obj GEM object to drm_exec_lock_obj()
 * so that it is locked as part of @exec.
 *
 * Return: %0 on success. See drm_exec_lock_obj() for error codes.
 */
int xe_vm_drm_exec_lock(struct xe_vm *vm, struct drm_exec *exec)
{
	struct drm_gem_object *vm_obj = xe_vm_obj(vm);

	return drm_exec_lock_obj(exec, vm_obj);
}
64 
65 static bool preempt_fences_waiting(struct xe_vm *vm)
66 {
67 	struct xe_exec_queue *q;
68 
69 	lockdep_assert_held(&vm->lock);
70 	xe_vm_assert_held(vm);
71 
72 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
73 		if (!q->lr.pfence ||
74 		    test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
75 			     &q->lr.pfence->flags)) {
76 			return true;
77 		}
78 	}
79 
80 	return false;
81 }
82 
83 static void free_preempt_fences(struct list_head *list)
84 {
85 	struct list_head *link, *next;
86 
87 	list_for_each_safe(link, next, list)
88 		xe_preempt_fence_free(to_preempt_fence_from_link(link));
89 }
90 
91 static int alloc_preempt_fences(struct xe_vm *vm, struct list_head *list,
92 				unsigned int *count)
93 {
94 	lockdep_assert_held(&vm->lock);
95 	xe_vm_assert_held(vm);
96 
97 	if (*count >= vm->preempt.num_exec_queues)
98 		return 0;
99 
100 	for (; *count < vm->preempt.num_exec_queues; ++(*count)) {
101 		struct xe_preempt_fence *pfence = xe_preempt_fence_alloc();
102 
103 		if (IS_ERR(pfence))
104 			return PTR_ERR(pfence);
105 
106 		list_move_tail(xe_preempt_fence_link(pfence), list);
107 	}
108 
109 	return 0;
110 }
111 
112 static int wait_for_existing_preempt_fences(struct xe_vm *vm)
113 {
114 	struct xe_exec_queue *q;
115 	bool vf_migration = IS_SRIOV_VF(vm->xe) &&
116 		xe_sriov_vf_migration_supported(vm->xe);
117 	signed long wait_time = vf_migration ? HZ / 5 : MAX_SCHEDULE_TIMEOUT;
118 
119 	xe_vm_assert_held(vm);
120 
121 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
122 		if (q->lr.pfence) {
123 			long timeout;
124 
125 			timeout = dma_fence_wait_timeout(q->lr.pfence, false,
126 							 wait_time);
127 			if (!timeout) {
128 				xe_assert(vm->xe, vf_migration);
129 				return -EAGAIN;
130 			}
131 
132 			/* Only -ETIME on fence indicates VM needs to be killed */
133 			if (timeout < 0 || q->lr.pfence->error == -ETIME)
134 				return -ETIME;
135 
136 			dma_fence_put(q->lr.pfence);
137 			q->lr.pfence = NULL;
138 		}
139 	}
140 
141 	return 0;
142 }
143 
144 static bool xe_vm_is_idle(struct xe_vm *vm)
145 {
146 	struct xe_exec_queue *q;
147 
148 	xe_vm_assert_held(vm);
149 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
150 		if (!xe_exec_queue_is_idle(q))
151 			return false;
152 	}
153 
154 	return true;
155 }
156 
157 static void arm_preempt_fences(struct xe_vm *vm, struct list_head *list)
158 {
159 	struct list_head *link;
160 	struct xe_exec_queue *q;
161 
162 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
163 		struct dma_fence *fence;
164 
165 		link = list->next;
166 		xe_assert(vm->xe, link != list);
167 
168 		fence = xe_preempt_fence_arm(to_preempt_fence_from_link(link),
169 					     q, q->lr.context,
170 					     ++q->lr.seqno);
171 		dma_fence_put(q->lr.pfence);
172 		q->lr.pfence = fence;
173 	}
174 }
175 
176 static int add_preempt_fences(struct xe_vm *vm, struct xe_bo *bo)
177 {
178 	struct xe_exec_queue *q;
179 	int err;
180 
181 	xe_bo_assert_held(bo);
182 
183 	if (!vm->preempt.num_exec_queues)
184 		return 0;
185 
186 	err = dma_resv_reserve_fences(bo->ttm.base.resv, vm->preempt.num_exec_queues);
187 	if (err)
188 		return err;
189 
190 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link)
191 		if (q->lr.pfence) {
192 			dma_resv_add_fence(bo->ttm.base.resv,
193 					   q->lr.pfence,
194 					   DMA_RESV_USAGE_BOOKKEEP);
195 		}
196 
197 	return 0;
198 }
199 
200 static void resume_and_reinstall_preempt_fences(struct xe_vm *vm,
201 						struct drm_exec *exec)
202 {
203 	struct xe_exec_queue *q;
204 
205 	lockdep_assert_held(&vm->lock);
206 	xe_vm_assert_held(vm);
207 
208 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
209 		q->ops->resume(q);
210 
211 		drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, q->lr.pfence,
212 					 DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_BOOKKEEP);
213 	}
214 }
215 
216 int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
217 {
218 	struct drm_gpuvm_exec vm_exec = {
219 		.vm = &vm->gpuvm,
220 		.flags = DRM_EXEC_INTERRUPTIBLE_WAIT,
221 		.num_fences = 1,
222 	};
223 	struct drm_exec *exec = &vm_exec.exec;
224 	struct xe_validation_ctx ctx;
225 	struct dma_fence *pfence;
226 	int err;
227 	bool wait;
228 
229 	xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
230 
231 	down_write(&vm->lock);
232 	err = xe_validation_exec_lock(&ctx, &vm_exec, &vm->xe->val);
233 	if (err)
234 		goto out_up_write;
235 
236 	pfence = xe_preempt_fence_create(q, q->lr.context,
237 					 ++q->lr.seqno);
238 	if (IS_ERR(pfence)) {
239 		err = PTR_ERR(pfence);
240 		goto out_fini;
241 	}
242 
243 	list_add(&q->lr.link, &vm->preempt.exec_queues);
244 	++vm->preempt.num_exec_queues;
245 	q->lr.pfence = pfence;
246 
247 	xe_svm_notifier_lock(vm);
248 
249 	drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, pfence,
250 				 DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_BOOKKEEP);
251 
252 	/*
253 	 * Check to see if a preemption on VM is in flight or userptr
254 	 * invalidation, if so trigger this preempt fence to sync state with
255 	 * other preempt fences on the VM.
256 	 */
257 	wait = __xe_vm_userptr_needs_repin(vm) || preempt_fences_waiting(vm);
258 	if (wait)
259 		dma_fence_enable_sw_signaling(pfence);
260 
261 	xe_svm_notifier_unlock(vm);
262 
263 out_fini:
264 	xe_validation_ctx_fini(&ctx);
265 out_up_write:
266 	up_write(&vm->lock);
267 
268 	return err;
269 }
270 ALLOW_ERROR_INJECTION(xe_vm_add_compute_exec_queue, ERRNO);
271 
272 /**
273  * xe_vm_remove_compute_exec_queue() - Remove compute exec queue from VM
274  * @vm: The VM.
275  * @q: The exec_queue
276  *
277  * Note that this function might be called multiple times on the same queue.
278  */
279 void xe_vm_remove_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
280 {
281 	if (!xe_vm_in_preempt_fence_mode(vm))
282 		return;
283 
284 	down_write(&vm->lock);
285 	if (!list_empty(&q->lr.link)) {
286 		list_del_init(&q->lr.link);
287 		--vm->preempt.num_exec_queues;
288 	}
289 	if (q->lr.pfence) {
290 		dma_fence_enable_sw_signaling(q->lr.pfence);
291 		dma_fence_put(q->lr.pfence);
292 		q->lr.pfence = NULL;
293 	}
294 	up_write(&vm->lock);
295 }
296 
297 #define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000
298 
299 /**
300  * xe_vm_kill() - VM Kill
301  * @vm: The VM.
302  * @unlocked: Flag indicates the VM's dma-resv is not held
303  *
304  * Kill the VM by setting banned flag indicated VM is no longer available for
305  * use. If in preempt fence mode, also kill all exec queue attached to the VM.
306  */
307 void xe_vm_kill(struct xe_vm *vm, bool unlocked)
308 {
309 	struct xe_exec_queue *q;
310 
311 	lockdep_assert_held(&vm->lock);
312 
313 	if (unlocked)
314 		xe_vm_lock(vm, false);
315 
316 	vm->flags |= XE_VM_FLAG_BANNED;
317 	trace_xe_vm_kill(vm);
318 
319 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link)
320 		q->ops->kill(q);
321 
322 	if (unlocked)
323 		xe_vm_unlock(vm);
324 
325 	/* TODO: Inform user the VM is banned */
326 }
327 
328 static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec)
329 {
330 	struct xe_vm *vm = gpuvm_to_vm(vm_bo->vm);
331 	struct xe_bo *bo = gem_to_xe_bo(vm_bo->obj);
332 	struct drm_gpuva *gpuva;
333 	int ret;
334 
335 	lockdep_assert_held(&vm->lock);
336 	drm_gpuvm_bo_for_each_va(gpuva, vm_bo)
337 		list_move_tail(&gpuva_to_vma(gpuva)->combined_links.rebind,
338 			       &vm->rebind_list);
339 
340 	/* Skip re-populating purged BOs, rebind maps scratch pages. */
341 	if (xe_bo_is_purged(bo)) {
342 		vm_bo->evicted = false;
343 		return 0;
344 	}
345 
346 	if (!try_wait_for_completion(&vm->xe->pm_block))
347 		return -EAGAIN;
348 
349 	ret = xe_bo_validate(bo, vm, false, exec);
350 	if (ret)
351 		return ret;
352 
353 	vm_bo->evicted = false;
354 	return 0;
355 }
356 
357 /**
358  * xe_vm_validate_rebind() - Validate buffer objects and rebind vmas
359  * @vm: The vm for which we are rebinding.
360  * @exec: The struct drm_exec with the locked GEM objects.
361  * @num_fences: The number of fences to reserve for the operation, not
362  * including rebinds and validations.
363  *
364  * Validates all evicted gem objects and rebinds their vmas. Note that
365  * rebindings may cause evictions and hence the validation-rebind
366  * sequence is rerun until there are no more objects to validate.
367  *
368  * Return: 0 on success, negative error code on error. In particular,
369  * may return -EINTR or -ERESTARTSYS if interrupted, and -EDEADLK if
370  * the drm_exec transaction needs to be restarted.
371  */
372 int xe_vm_validate_rebind(struct xe_vm *vm, struct drm_exec *exec,
373 			  unsigned int num_fences)
374 {
375 	struct drm_gem_object *obj;
376 	int ret;
377 
378 	do {
379 		ret = drm_gpuvm_validate(&vm->gpuvm, exec);
380 		if (ret)
381 			return ret;
382 
383 		ret = xe_vm_rebind(vm, false);
384 		if (ret)
385 			return ret;
386 	} while (!list_empty(&vm->gpuvm.evict.list));
387 
388 	drm_exec_for_each_locked_object(exec, obj) {
389 		ret = dma_resv_reserve_fences(obj->resv, num_fences);
390 		if (ret)
391 			return ret;
392 	}
393 
394 	return 0;
395 }
396 
397 static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm,
398 				 bool *done)
399 {
400 	int err;
401 
402 	err = drm_gpuvm_prepare_vm(&vm->gpuvm, exec, 0);
403 	if (err)
404 		return err;
405 
406 	if (xe_vm_is_idle(vm)) {
407 		vm->preempt.rebind_deactivated = true;
408 		*done = true;
409 		return 0;
410 	}
411 
412 	if (!preempt_fences_waiting(vm)) {
413 		*done = true;
414 		return 0;
415 	}
416 
417 	err = drm_gpuvm_prepare_objects(&vm->gpuvm, exec, 0);
418 	if (err)
419 		return err;
420 
421 	err = wait_for_existing_preempt_fences(vm);
422 	if (err)
423 		return err;
424 
425 	/*
426 	 * Add validation and rebinding to the locking loop since both can
427 	 * cause evictions which may require blocing dma_resv locks.
428 	 * The fence reservation here is intended for the new preempt fences
429 	 * we attach at the end of the rebind work.
430 	 */
431 	return xe_vm_validate_rebind(vm, exec, vm->preempt.num_exec_queues);
432 }
433 
434 static bool vm_suspend_rebind_worker(struct xe_vm *vm)
435 {
436 	struct xe_device *xe = vm->xe;
437 	bool ret = false;
438 
439 	mutex_lock(&xe->rebind_resume_lock);
440 	if (!try_wait_for_completion(&vm->xe->pm_block)) {
441 		ret = true;
442 		list_move_tail(&vm->preempt.pm_activate_link, &xe->rebind_resume_list);
443 	}
444 	mutex_unlock(&xe->rebind_resume_lock);
445 
446 	return ret;
447 }
448 
449 /**
450  * xe_vm_resume_rebind_worker() - Resume the rebind worker.
451  * @vm: The vm whose preempt worker to resume.
452  *
453  * Resume a preempt worker that was previously suspended by
454  * vm_suspend_rebind_worker().
455  */
456 void xe_vm_resume_rebind_worker(struct xe_vm *vm)
457 {
458 	queue_work(vm->xe->ordered_wq, &vm->preempt.rebind_work);
459 }
460 
461 static void preempt_rebind_work_func(struct work_struct *w)
462 {
463 	struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
464 	struct xe_validation_ctx ctx;
465 	struct drm_exec exec;
466 	unsigned int fence_count = 0;
467 	LIST_HEAD(preempt_fences);
468 	int err = 0;
469 	long wait;
470 	int __maybe_unused tries = 0;
471 
472 	xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
473 	trace_xe_vm_rebind_worker_enter(vm);
474 
475 	down_write(&vm->lock);
476 
477 	if (xe_vm_is_closed_or_banned(vm)) {
478 		up_write(&vm->lock);
479 		trace_xe_vm_rebind_worker_exit(vm);
480 		return;
481 	}
482 
483 retry:
484 	if (!try_wait_for_completion(&vm->xe->pm_block) && vm_suspend_rebind_worker(vm)) {
485 		up_write(&vm->lock);
486 		/* We don't actually block but don't make progress. */
487 		xe_pm_might_block_on_suspend();
488 		return;
489 	}
490 
491 	if (xe_vm_userptr_check_repin(vm)) {
492 		err = xe_vm_userptr_pin(vm);
493 		if (err)
494 			goto out_unlock_outer;
495 	}
496 
497 	err = xe_validation_ctx_init(&ctx, &vm->xe->val, &exec,
498 				     (struct xe_val_flags) {.interruptible = true});
499 	if (err)
500 		goto out_unlock_outer;
501 
502 	drm_exec_until_all_locked(&exec) {
503 		bool done = false;
504 
505 		err = xe_preempt_work_begin(&exec, vm, &done);
506 		drm_exec_retry_on_contention(&exec);
507 		xe_validation_retry_on_oom(&ctx, &err);
508 		if (err || done) {
509 			xe_validation_ctx_fini(&ctx);
510 			goto out_unlock_outer;
511 		}
512 	}
513 
514 	err = alloc_preempt_fences(vm, &preempt_fences, &fence_count);
515 	if (err)
516 		goto out_unlock;
517 
518 	xe_vm_set_validation_exec(vm, &exec);
519 	err = xe_vm_rebind(vm, true);
520 	xe_vm_set_validation_exec(vm, NULL);
521 	if (err)
522 		goto out_unlock;
523 
524 	/* Wait on rebinds and munmap style VM unbinds */
525 	wait = dma_resv_wait_timeout(xe_vm_resv(vm),
526 				     DMA_RESV_USAGE_KERNEL,
527 				     false, MAX_SCHEDULE_TIMEOUT);
528 	if (wait <= 0) {
529 		err = -ETIME;
530 		goto out_unlock;
531 	}
532 
533 #define retry_required(__tries, __vm) \
534 	(IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT) ? \
535 	(!(__tries)++ || __xe_vm_userptr_needs_repin(__vm)) : \
536 	__xe_vm_userptr_needs_repin(__vm))
537 
538 	xe_svm_notifier_lock(vm);
539 	if (retry_required(tries, vm)) {
540 		xe_svm_notifier_unlock(vm);
541 		err = -EAGAIN;
542 		goto out_unlock;
543 	}
544 
545 #undef retry_required
546 
547 	spin_lock(&vm->xe->ttm.lru_lock);
548 	ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
549 	spin_unlock(&vm->xe->ttm.lru_lock);
550 
551 	/* Point of no return. */
552 	arm_preempt_fences(vm, &preempt_fences);
553 	resume_and_reinstall_preempt_fences(vm, &exec);
554 	xe_svm_notifier_unlock(vm);
555 
556 out_unlock:
557 	xe_validation_ctx_fini(&ctx);
558 out_unlock_outer:
559 	if (err == -EAGAIN) {
560 		trace_xe_vm_rebind_worker_retry(vm);
561 
562 		/*
563 		 * We can't block in workers on a VF which supports migration
564 		 * given this can block the VF post-migration workers from
565 		 * getting scheduled.
566 		 */
567 		if (IS_SRIOV_VF(vm->xe) &&
568 		    xe_sriov_vf_migration_supported(vm->xe)) {
569 			up_write(&vm->lock);
570 			xe_vm_queue_rebind_worker(vm);
571 			return;
572 		}
573 
574 		goto retry;
575 	}
576 
577 	if (err) {
578 		drm_warn(&vm->xe->drm, "VM worker error: %d\n", err);
579 		xe_vm_kill(vm, true);
580 	}
581 	up_write(&vm->lock);
582 
583 	free_preempt_fences(&preempt_fences);
584 
585 	trace_xe_vm_rebind_worker_exit(vm);
586 }
587 
588 /**
589  * xe_vm_add_fault_entry_pf() - Add pagefault to vm fault list
590  * @vm: The VM.
591  * @pf: The pagefault.
592  *
593  * This function takes the data from the pagefault @pf and saves it to @vm->faults.list.
594  *
595  * The function exits silently if the list is full, and reports a warning if the pagefault
596  * could not be saved to the list.
597  */
598 void xe_vm_add_fault_entry_pf(struct xe_vm *vm, struct xe_pagefault *pf)
599 {
600 	struct xe_vm_fault_entry *e;
601 	struct xe_hw_engine *hwe;
602 
603 	/* Do not report faults on reserved engines */
604 	hwe = xe_gt_hw_engine(pf->gt, pf->consumer.engine_class,
605 			      pf->consumer.engine_instance, false);
606 	if (!hwe || xe_hw_engine_is_reserved(hwe))
607 		return;
608 
609 	e = kzalloc_obj(*e);
610 	if (!e) {
611 		drm_warn(&vm->xe->drm,
612 			 "Could not allocate memory for fault!\n");
613 		return;
614 	}
615 
616 	guard(spinlock)(&vm->faults.lock);
617 
618 	/*
619 	 * Limit the number of faults in the fault list to prevent
620 	 * memory overuse.
621 	 */
622 	if (vm->faults.len >= MAX_FAULTS_SAVED_PER_VM) {
623 		kfree(e);
624 		return;
625 	}
626 
627 	e->address = pf->consumer.page_addr;
628 	/*
629 	 * TODO:
630 	 * Address precision is currently always SZ_4K, but this may change
631 	 * in the future.
632 	 */
633 	e->address_precision = SZ_4K;
634 	e->access_type = pf->consumer.access_type;
635 	e->fault_type = FIELD_GET(XE_PAGEFAULT_TYPE_MASK,
636 				  pf->consumer.fault_type_level);
637 	e->fault_level = FIELD_GET(XE_PAGEFAULT_LEVEL_MASK,
638 				   pf->consumer.fault_type_level);
639 
640 	list_add_tail(&e->list, &vm->faults.list);
641 	vm->faults.len++;
642 }
643 
644 static void xe_vm_clear_fault_entries(struct xe_vm *vm)
645 {
646 	struct xe_vm_fault_entry *e, *tmp;
647 
648 	guard(spinlock)(&vm->faults.lock);
649 	list_for_each_entry_safe(e, tmp, &vm->faults.list, list) {
650 		list_del(&e->list);
651 		kfree(e);
652 	}
653 	vm->faults.len = 0;
654 }
655 
656 static int xe_vma_ops_alloc(struct xe_vma_ops *vops, bool array_of_binds)
657 {
658 	int i;
659 
660 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i) {
661 		if (!vops->pt_update_ops[i].num_ops)
662 			continue;
663 
664 		vops->pt_update_ops[i].ops =
665 			kmalloc_objs(*vops->pt_update_ops[i].ops,
666 				     vops->pt_update_ops[i].num_ops,
667 				     GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
668 		if (!vops->pt_update_ops[i].ops)
669 			return array_of_binds ? -ENOBUFS : -ENOMEM;
670 	}
671 
672 	return 0;
673 }
674 ALLOW_ERROR_INJECTION(xe_vma_ops_alloc, ERRNO);
675 
676 static void xe_vma_svm_prefetch_op_fini(struct xe_vma_op *op)
677 {
678 	struct xe_vma *vma;
679 
680 	vma = gpuva_to_vma(op->base.prefetch.va);
681 
682 	if (op->base.op == DRM_GPUVA_OP_PREFETCH && xe_vma_is_cpu_addr_mirror(vma))
683 		xa_destroy(&op->prefetch_range.range);
684 }
685 
686 static void xe_vma_svm_prefetch_ops_fini(struct xe_vma_ops *vops)
687 {
688 	struct xe_vma_op *op;
689 
690 	if (!(vops->flags & XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH))
691 		return;
692 
693 	list_for_each_entry(op, &vops->list, link)
694 		xe_vma_svm_prefetch_op_fini(op);
695 }
696 
697 static void xe_vma_ops_fini(struct xe_vma_ops *vops)
698 {
699 	int i;
700 
701 	xe_vma_svm_prefetch_ops_fini(vops);
702 
703 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
704 		kfree(vops->pt_update_ops[i].ops);
705 }
706 
707 static void xe_vma_ops_incr_pt_update_ops(struct xe_vma_ops *vops, u8 tile_mask, int inc_val)
708 {
709 	int i;
710 
711 	if (!inc_val)
712 		return;
713 
714 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
715 		if (BIT(i) & tile_mask)
716 			vops->pt_update_ops[i].num_ops += inc_val;
717 }
718 
719 #define XE_VMA_CREATE_MASK (		    \
720 	XE_VMA_READ_ONLY |		    \
721 	XE_VMA_DUMPABLE |		    \
722 	XE_VMA_SYSTEM_ALLOCATOR |           \
723 	DRM_GPUVA_SPARSE |		    \
724 	XE_VMA_MADV_AUTORESET)
725 
726 static void xe_vm_populate_rebind(struct xe_vma_op *op, struct xe_vma *vma,
727 				  u8 tile_mask)
728 {
729 	INIT_LIST_HEAD(&op->link);
730 	op->tile_mask = tile_mask;
731 	op->base.op = DRM_GPUVA_OP_MAP;
732 	op->base.map.va.addr = vma->gpuva.va.addr;
733 	op->base.map.va.range = vma->gpuva.va.range;
734 	op->base.map.gem.obj = vma->gpuva.gem.obj;
735 	op->base.map.gem.offset = vma->gpuva.gem.offset;
736 	op->map.vma = vma;
737 	op->map.immediate = true;
738 	op->map.vma_flags = vma->gpuva.flags & XE_VMA_CREATE_MASK;
739 }
740 
741 static int xe_vm_ops_add_rebind(struct xe_vma_ops *vops, struct xe_vma *vma,
742 				u8 tile_mask)
743 {
744 	struct xe_vma_op *op;
745 
746 	op = kzalloc_obj(*op);
747 	if (!op)
748 		return -ENOMEM;
749 
750 	xe_vm_populate_rebind(op, vma, tile_mask);
751 	list_add_tail(&op->link, &vops->list);
752 	xe_vma_ops_incr_pt_update_ops(vops, tile_mask, 1);
753 
754 	return 0;
755 }
756 
757 static struct dma_fence *ops_execute(struct xe_vm *vm,
758 				     struct xe_vma_ops *vops);
759 static void xe_vma_ops_init(struct xe_vma_ops *vops, struct xe_vm *vm,
760 			    struct xe_exec_queue *q,
761 			    struct xe_sync_entry *syncs, u32 num_syncs);
762 
763 int xe_vm_rebind(struct xe_vm *vm, bool rebind_worker)
764 {
765 	struct dma_fence *fence;
766 	struct xe_vma *vma, *next;
767 	struct xe_vma_ops vops;
768 	struct xe_vma_op *op, *next_op;
769 	int err, i;
770 
771 	lockdep_assert_held(&vm->lock);
772 	if ((xe_vm_in_lr_mode(vm) && !rebind_worker) ||
773 	    list_empty(&vm->rebind_list))
774 		return 0;
775 
776 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
777 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
778 		vops.pt_update_ops[i].wait_vm_bookkeep = true;
779 
780 	xe_vm_assert_held(vm);
781 	list_for_each_entry(vma, &vm->rebind_list, combined_links.rebind) {
782 		xe_assert(vm->xe, vma->tile_present);
783 
784 		if (rebind_worker)
785 			trace_xe_vma_rebind_worker(vma);
786 		else
787 			trace_xe_vma_rebind_exec(vma);
788 
789 		err = xe_vm_ops_add_rebind(&vops, vma,
790 					   vma->tile_present);
791 		if (err)
792 			goto free_ops;
793 	}
794 
795 	err = xe_vma_ops_alloc(&vops, false);
796 	if (err)
797 		goto free_ops;
798 
799 	fence = ops_execute(vm, &vops);
800 	if (IS_ERR(fence)) {
801 		err = PTR_ERR(fence);
802 	} else {
803 		dma_fence_put(fence);
804 		list_for_each_entry_safe(vma, next, &vm->rebind_list,
805 					 combined_links.rebind)
806 			list_del_init(&vma->combined_links.rebind);
807 	}
808 free_ops:
809 	list_for_each_entry_safe(op, next_op, &vops.list, link) {
810 		list_del(&op->link);
811 		kfree(op);
812 	}
813 	xe_vma_ops_fini(&vops);
814 
815 	return err;
816 }
817 
818 struct dma_fence *xe_vma_rebind(struct xe_vm *vm, struct xe_vma *vma, u8 tile_mask)
819 {
820 	struct dma_fence *fence = NULL;
821 	struct xe_vma_ops vops;
822 	struct xe_vma_op *op, *next_op;
823 	struct xe_tile *tile;
824 	u8 id;
825 	int err;
826 
827 	lockdep_assert_held(&vm->lock);
828 	xe_vm_assert_held(vm);
829 	xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
830 
831 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
832 	vops.flags |= XE_VMA_OPS_FLAG_SKIP_TLB_WAIT;
833 	for_each_tile(tile, vm->xe, id) {
834 		vops.pt_update_ops[id].wait_vm_bookkeep = true;
835 		vops.pt_update_ops[tile->id].q =
836 			xe_migrate_exec_queue(tile->migrate);
837 	}
838 
839 	err = xe_vm_ops_add_rebind(&vops, vma, tile_mask);
840 	if (err)
841 		return ERR_PTR(err);
842 
843 	err = xe_vma_ops_alloc(&vops, false);
844 	if (err) {
845 		fence = ERR_PTR(err);
846 		goto free_ops;
847 	}
848 
849 	fence = ops_execute(vm, &vops);
850 
851 free_ops:
852 	list_for_each_entry_safe(op, next_op, &vops.list, link) {
853 		list_del(&op->link);
854 		kfree(op);
855 	}
856 	xe_vma_ops_fini(&vops);
857 
858 	return fence;
859 }
860 
861 static void xe_vm_populate_range_rebind(struct xe_vma_op *op,
862 					struct xe_vma *vma,
863 					struct xe_svm_range *range,
864 					u8 tile_mask)
865 {
866 	INIT_LIST_HEAD(&op->link);
867 	op->tile_mask = tile_mask;
868 	op->base.op = DRM_GPUVA_OP_DRIVER;
869 	op->subop = XE_VMA_SUBOP_MAP_RANGE;
870 	op->map_range.vma = vma;
871 	op->map_range.range = range;
872 }
873 
874 static int
875 xe_vm_ops_add_range_rebind(struct xe_vma_ops *vops,
876 			   struct xe_vma *vma,
877 			   struct xe_svm_range *range,
878 			   u8 tile_mask)
879 {
880 	struct xe_vma_op *op;
881 
882 	op = kzalloc_obj(*op);
883 	if (!op)
884 		return -ENOMEM;
885 
886 	xe_vm_populate_range_rebind(op, vma, range, tile_mask);
887 	list_add_tail(&op->link, &vops->list);
888 	xe_vma_ops_incr_pt_update_ops(vops, tile_mask, 1);
889 
890 	return 0;
891 }
892 
893 /**
894  * xe_vm_range_rebind() - VM range (re)bind
895  * @vm: The VM which the range belongs to.
896  * @vma: The VMA which the range belongs to.
897  * @range: SVM range to rebind.
898  * @tile_mask: Tile mask to bind the range to.
899  *
900  * (re)bind SVM range setting up GPU page tables for the range.
901  *
902  * Return: dma fence for rebind to signal completion on success, ERR_PTR on
903  * failure
904  */
905 struct dma_fence *xe_vm_range_rebind(struct xe_vm *vm,
906 				     struct xe_vma *vma,
907 				     struct xe_svm_range *range,
908 				     u8 tile_mask)
909 {
910 	struct dma_fence *fence = NULL;
911 	struct xe_vma_ops vops;
912 	struct xe_vma_op *op, *next_op;
913 	struct xe_tile *tile;
914 	u8 id;
915 	int err;
916 
917 	lockdep_assert_held(&vm->lock);
918 	xe_vm_assert_held(vm);
919 	xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
920 	xe_assert(vm->xe, xe_vma_is_cpu_addr_mirror(vma));
921 
922 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
923 	vops.flags |= XE_VMA_OPS_FLAG_SKIP_TLB_WAIT;
924 	for_each_tile(tile, vm->xe, id) {
925 		vops.pt_update_ops[id].wait_vm_bookkeep = true;
926 		vops.pt_update_ops[tile->id].q =
927 			xe_migrate_exec_queue(tile->migrate);
928 	}
929 
930 	err = xe_vm_ops_add_range_rebind(&vops, vma, range, tile_mask);
931 	if (err)
932 		return ERR_PTR(err);
933 
934 	err = xe_vma_ops_alloc(&vops, false);
935 	if (err) {
936 		fence = ERR_PTR(err);
937 		goto free_ops;
938 	}
939 
940 	fence = ops_execute(vm, &vops);
941 
942 free_ops:
943 	list_for_each_entry_safe(op, next_op, &vops.list, link) {
944 		list_del(&op->link);
945 		kfree(op);
946 	}
947 	xe_vma_ops_fini(&vops);
948 
949 	return fence;
950 }
951 
952 static void xe_vm_populate_range_unbind(struct xe_vma_op *op,
953 					struct xe_svm_range *range)
954 {
955 	INIT_LIST_HEAD(&op->link);
956 	op->tile_mask = range->tile_present;
957 	op->base.op = DRM_GPUVA_OP_DRIVER;
958 	op->subop = XE_VMA_SUBOP_UNMAP_RANGE;
959 	op->unmap_range.range = range;
960 }
961 
962 static int
963 xe_vm_ops_add_range_unbind(struct xe_vma_ops *vops,
964 			   struct xe_svm_range *range)
965 {
966 	struct xe_vma_op *op;
967 
968 	op = kzalloc_obj(*op);
969 	if (!op)
970 		return -ENOMEM;
971 
972 	xe_vm_populate_range_unbind(op, range);
973 	list_add_tail(&op->link, &vops->list);
974 	xe_vma_ops_incr_pt_update_ops(vops, range->tile_present, 1);
975 
976 	return 0;
977 }
978 
979 /**
980  * xe_vm_range_unbind() - VM range unbind
981  * @vm: The VM which the range belongs to.
982  * @range: SVM range to rebind.
983  *
984  * Unbind SVM range removing the GPU page tables for the range.
985  *
986  * Return: dma fence for unbind to signal completion on success, ERR_PTR on
987  * failure
988  */
989 struct dma_fence *xe_vm_range_unbind(struct xe_vm *vm,
990 				     struct xe_svm_range *range)
991 {
992 	struct dma_fence *fence = NULL;
993 	struct xe_vma_ops vops;
994 	struct xe_vma_op *op, *next_op;
995 	struct xe_tile *tile;
996 	u8 id;
997 	int err;
998 
999 	lockdep_assert_held(&vm->lock);
1000 	xe_vm_assert_held(vm);
1001 	xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
1002 
1003 	if (!range->tile_present)
1004 		return dma_fence_get_stub();
1005 
1006 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
1007 	for_each_tile(tile, vm->xe, id) {
1008 		vops.pt_update_ops[id].wait_vm_bookkeep = true;
1009 		vops.pt_update_ops[tile->id].q =
1010 			xe_migrate_exec_queue(tile->migrate);
1011 	}
1012 
1013 	err = xe_vm_ops_add_range_unbind(&vops, range);
1014 	if (err)
1015 		return ERR_PTR(err);
1016 
1017 	err = xe_vma_ops_alloc(&vops, false);
1018 	if (err) {
1019 		fence = ERR_PTR(err);
1020 		goto free_ops;
1021 	}
1022 
1023 	fence = ops_execute(vm, &vops);
1024 
1025 free_ops:
1026 	list_for_each_entry_safe(op, next_op, &vops.list, link) {
1027 		list_del(&op->link);
1028 		kfree(op);
1029 	}
1030 	xe_vma_ops_fini(&vops);
1031 
1032 	return fence;
1033 }
1034 
1035 static void xe_vma_mem_attr_fini(struct xe_vma_mem_attr *attr)
1036 {
1037 	drm_pagemap_put(attr->preferred_loc.dpagemap);
1038 }
1039 
1040 static void xe_vma_free(struct xe_vma *vma)
1041 {
1042 	xe_vma_mem_attr_fini(&vma->attr);
1043 
1044 	if (xe_vma_is_userptr(vma))
1045 		kfree(to_userptr_vma(vma));
1046 	else
1047 		kfree(vma);
1048 }
1049 
1050 /**
1051  * xe_vma_mem_attr_copy() - copy an xe_vma_mem_attr structure.
1052  * @to: Destination.
1053  * @from: Source.
1054  *
1055  * Copies an xe_vma_mem_attr structure taking care to get reference
1056  * counting of individual members right.
1057  */
1058 void xe_vma_mem_attr_copy(struct xe_vma_mem_attr *to, struct xe_vma_mem_attr *from)
1059 {
1060 	xe_vma_mem_attr_fini(to);
1061 	*to = *from;
1062 	if (to->preferred_loc.dpagemap)
1063 		drm_pagemap_get(to->preferred_loc.dpagemap);
1064 }
1065 
1066 static struct xe_vma *xe_vma_create(struct xe_vm *vm,
1067 				    struct xe_bo *bo,
1068 				    u64 bo_offset_or_userptr,
1069 				    u64 start, u64 end,
1070 				    struct xe_vma_mem_attr *attr,
1071 				    unsigned int flags)
1072 {
1073 	struct xe_vma *vma;
1074 	struct xe_tile *tile;
1075 	u8 id;
1076 	bool is_null = (flags & DRM_GPUVA_SPARSE);
1077 	bool is_cpu_addr_mirror = (flags & XE_VMA_SYSTEM_ALLOCATOR);
1078 
1079 	xe_assert(vm->xe, start < end);
1080 	xe_assert(vm->xe, end < vm->size);
1081 
1082 	/*
1083 	 * Allocate and ensure that the xe_vma_is_userptr() return
1084 	 * matches what was allocated.
1085 	 */
1086 	if (!bo && !is_null && !is_cpu_addr_mirror) {
1087 		struct xe_userptr_vma *uvma = kzalloc_obj(*uvma);
1088 
1089 		if (!uvma)
1090 			return ERR_PTR(-ENOMEM);
1091 
1092 		vma = &uvma->vma;
1093 	} else {
1094 		vma = kzalloc_obj(*vma);
1095 		if (!vma)
1096 			return ERR_PTR(-ENOMEM);
1097 
1098 		if (bo)
1099 			vma->gpuva.gem.obj = &bo->ttm.base;
1100 	}
1101 
1102 	INIT_LIST_HEAD(&vma->combined_links.rebind);
1103 
1104 	INIT_LIST_HEAD(&vma->gpuva.gem.entry);
1105 	vma->gpuva.vm = &vm->gpuvm;
1106 	vma->gpuva.va.addr = start;
1107 	vma->gpuva.va.range = end - start + 1;
1108 	vma->gpuva.flags = flags;
1109 
1110 	for_each_tile(tile, vm->xe, id)
1111 		vma->tile_mask |= 0x1 << id;
1112 
1113 	if (vm->xe->info.has_atomic_enable_pte_bit)
1114 		vma->gpuva.flags |= XE_VMA_ATOMIC_PTE_BIT;
1115 
1116 	xe_vma_mem_attr_copy(&vma->attr, attr);
1117 	if (bo) {
1118 		struct drm_gpuvm_bo *vm_bo;
1119 
1120 		xe_bo_assert_held(bo);
1121 
1122 		/*
1123 		 * Reject only WILLNEED mappings on DONTNEED/PURGED BOs. This
1124 		 * gates new vm_bind ioctls (user supplies WILLNEED) while
1125 		 * still allowing partial-unbind / remap splits whose new VMAs
1126 		 * inherit the parent's DONTNEED attr. It must also run before
1127 		 * xe_bo_willneed_get_locked() below so a 0->1 holder bump
1128 		 * cannot silently promote DONTNEED back to WILLNEED.
1129 		 */
1130 		if (vma->attr.purgeable_state == XE_MADV_PURGEABLE_WILLNEED) {
1131 			if (xe_bo_madv_is_dontneed(bo)) {
1132 				xe_vma_free(vma);
1133 				return ERR_PTR(-EBUSY);
1134 			}
1135 			if (xe_bo_is_purged(bo)) {
1136 				xe_vma_free(vma);
1137 				return ERR_PTR(-EINVAL);
1138 			}
1139 		}
1140 
1141 		vm_bo = drm_gpuvm_bo_obtain_locked(vma->gpuva.vm, &bo->ttm.base);
1142 		if (IS_ERR(vm_bo)) {
1143 			xe_vma_free(vma);
1144 			return ERR_CAST(vm_bo);
1145 		}
1146 
1147 		drm_gpuvm_bo_extobj_add(vm_bo);
1148 		drm_gem_object_get(&bo->ttm.base);
1149 		vma->gpuva.gem.offset = bo_offset_or_userptr;
1150 		drm_gpuva_link(&vma->gpuva, vm_bo);
1151 		drm_gpuvm_bo_put(vm_bo);
1152 
1153 		xe_bo_vma_count_inc_locked(bo);
1154 		if (vma->attr.purgeable_state == XE_MADV_PURGEABLE_WILLNEED)
1155 			xe_bo_willneed_get_locked(bo);
1156 	} else /* userptr or null */ {
1157 		if (!is_null && !is_cpu_addr_mirror) {
1158 			struct xe_userptr_vma *uvma = to_userptr_vma(vma);
1159 			u64 size = end - start + 1;
1160 			int err;
1161 
1162 			vma->gpuva.gem.offset = bo_offset_or_userptr;
1163 
1164 			err = xe_userptr_setup(uvma, xe_vma_userptr(vma), size);
1165 			if (err) {
1166 				xe_vma_free(vma);
1167 				return ERR_PTR(err);
1168 			}
1169 		}
1170 
1171 		xe_vm_get(vm);
1172 	}
1173 
1174 	return vma;
1175 }
1176 
1177 static void xe_vma_destroy_late(struct xe_vma *vma)
1178 {
1179 	struct xe_vm *vm = xe_vma_vm(vma);
1180 	struct xe_bo *bo = xe_vma_bo(vma);
1181 
1182 	if (vma->ufence) {
1183 		xe_sync_ufence_put(vma->ufence);
1184 		vma->ufence = NULL;
1185 	}
1186 
1187 	if (xe_vma_is_userptr(vma)) {
1188 		struct xe_userptr_vma *uvma = to_userptr_vma(vma);
1189 
1190 		xe_userptr_remove(uvma);
1191 		xe_vm_put(vm);
1192 	} else if (xe_vma_is_null(vma) || xe_vma_is_cpu_addr_mirror(vma)) {
1193 		xe_vm_put(vm);
1194 	} else {
1195 		xe_bo_put(bo);
1196 	}
1197 
1198 	xe_vma_free(vma);
1199 }
1200 
1201 static void vma_destroy_work_func(struct work_struct *w)
1202 {
1203 	struct xe_vma *vma =
1204 		container_of(w, struct xe_vma, destroy_work);
1205 
1206 	xe_vma_destroy_late(vma);
1207 }
1208 
1209 static void vma_destroy_cb(struct dma_fence *fence,
1210 			   struct dma_fence_cb *cb)
1211 {
1212 	struct xe_vma *vma = container_of(cb, struct xe_vma, destroy_cb);
1213 
1214 	INIT_WORK(&vma->destroy_work, vma_destroy_work_func);
1215 	queue_work(system_dfl_wq, &vma->destroy_work);
1216 }
1217 
1218 static void xe_vma_destroy(struct xe_vma *vma, struct dma_fence *fence)
1219 {
1220 	struct xe_vm *vm = xe_vma_vm(vma);
1221 	struct xe_bo *bo = xe_vma_bo(vma);
1222 
1223 	lockdep_assert_held_write(&vm->lock);
1224 	xe_assert(vm->xe, list_empty(&vma->combined_links.destroy));
1225 
1226 	if (xe_vma_is_userptr(vma)) {
1227 		xe_assert(vm->xe, vma->gpuva.flags & XE_VMA_DESTROYED);
1228 		xe_userptr_destroy(to_userptr_vma(vma));
1229 	} else if (!xe_vma_is_null(vma) && !xe_vma_is_cpu_addr_mirror(vma)) {
1230 		xe_bo_assert_held(bo);
1231 
1232 		drm_gpuva_unlink(&vma->gpuva);
1233 
1234 		xe_bo_vma_count_dec_locked(bo);
1235 		if (vma->attr.purgeable_state == XE_MADV_PURGEABLE_WILLNEED)
1236 			xe_bo_willneed_put_locked(bo);
1237 	}
1238 
1239 	xe_vm_assert_held(vm);
1240 	if (fence) {
1241 		int ret = dma_fence_add_callback(fence, &vma->destroy_cb,
1242 						 vma_destroy_cb);
1243 
1244 		if (ret) {
1245 			XE_WARN_ON(ret != -ENOENT);
1246 			xe_vma_destroy_late(vma);
1247 		}
1248 	} else {
1249 		xe_vma_destroy_late(vma);
1250 	}
1251 }
1252 
1253 /**
1254  * xe_vm_lock_vma() - drm_exec utility to lock a vma
1255  * @exec: The drm_exec object we're currently locking for.
1256  * @vma: The vma for witch we want to lock the vm resv and any attached
1257  * object's resv.
1258  *
1259  * Return: 0 on success, negative error code on error. In particular
1260  * may return -EDEADLK on WW transaction contention and -EINTR if
1261  * an interruptible wait is terminated by a signal.
1262  */
1263 int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma)
1264 {
1265 	struct xe_vm *vm = xe_vma_vm(vma);
1266 	struct xe_bo *bo = xe_vma_bo(vma);
1267 	int err;
1268 
1269 	XE_WARN_ON(!vm);
1270 
1271 	err = drm_exec_lock_obj(exec, xe_vm_obj(vm));
1272 	if (!err && bo && !bo->vm)
1273 		err = drm_exec_lock_obj(exec, &bo->ttm.base);
1274 
1275 	return err;
1276 }
1277 
1278 static void xe_vma_destroy_unlocked(struct xe_vma *vma)
1279 {
1280 	struct xe_device *xe = xe_vma_vm(vma)->xe;
1281 	struct xe_validation_ctx ctx;
1282 	struct drm_exec exec;
1283 	int err = 0;
1284 
1285 	xe_validation_guard(&ctx, &xe->val, &exec, (struct xe_val_flags) {}, err) {
1286 		err = xe_vm_lock_vma(&exec, vma);
1287 		drm_exec_retry_on_contention(&exec);
1288 		if (XE_WARN_ON(err))
1289 			break;
1290 		xe_vma_destroy(vma, NULL);
1291 	}
1292 	xe_assert(xe, !err);
1293 }
1294 
1295 struct xe_vma *
1296 xe_vm_find_overlapping_vma(struct xe_vm *vm, u64 start, u64 range)
1297 {
1298 	struct drm_gpuva *gpuva;
1299 
1300 	lockdep_assert_held(&vm->lock);
1301 
1302 	if (xe_vm_is_closed_or_banned(vm))
1303 		return NULL;
1304 
1305 	xe_assert(vm->xe, start + range <= vm->size);
1306 
1307 	gpuva = drm_gpuva_find_first(&vm->gpuvm, start, range);
1308 
1309 	return gpuva ? gpuva_to_vma(gpuva) : NULL;
1310 }
1311 
1312 static int xe_vm_insert_vma(struct xe_vm *vm, struct xe_vma *vma)
1313 {
1314 	int err;
1315 
1316 	xe_assert(vm->xe, xe_vma_vm(vma) == vm);
1317 	lockdep_assert_held(&vm->lock);
1318 
1319 	mutex_lock(&vm->snap_mutex);
1320 	err = drm_gpuva_insert(&vm->gpuvm, &vma->gpuva);
1321 	mutex_unlock(&vm->snap_mutex);
1322 	XE_WARN_ON(err);	/* Shouldn't be possible */
1323 
1324 	return err;
1325 }
1326 
1327 static void xe_vm_remove_vma(struct xe_vm *vm, struct xe_vma *vma)
1328 {
1329 	xe_assert(vm->xe, xe_vma_vm(vma) == vm);
1330 	lockdep_assert_held(&vm->lock);
1331 
1332 	mutex_lock(&vm->snap_mutex);
1333 	drm_gpuva_remove(&vma->gpuva);
1334 	mutex_unlock(&vm->snap_mutex);
1335 	if (vm->usm.last_fault_vma == vma)
1336 		vm->usm.last_fault_vma = NULL;
1337 }
1338 
1339 static struct drm_gpuva_op *xe_vm_op_alloc(void)
1340 {
1341 	struct xe_vma_op *op;
1342 
1343 	op = kzalloc_obj(*op);
1344 
1345 	if (unlikely(!op))
1346 		return NULL;
1347 
1348 	return &op->base;
1349 }
1350 
1351 static void xe_vm_free(struct drm_gpuvm *gpuvm);
1352 
1353 static const struct drm_gpuvm_ops gpuvm_ops = {
1354 	.op_alloc = xe_vm_op_alloc,
1355 	.vm_bo_validate = xe_gpuvm_validate,
1356 	.vm_free = xe_vm_free,
1357 };
1358 
1359 static u64 pde_encode_pat_index(u16 pat_index)
1360 {
1361 	u64 pte = 0;
1362 
1363 	if (pat_index & BIT(0))
1364 		pte |= XE_PPGTT_PTE_PAT0;
1365 
1366 	if (pat_index & BIT(1))
1367 		pte |= XE_PPGTT_PTE_PAT1;
1368 
1369 	return pte;
1370 }
1371 
1372 static u64 pte_encode_pat_index(u16 pat_index, u32 pt_level)
1373 {
1374 	u64 pte = 0;
1375 
1376 	if (pat_index & BIT(0))
1377 		pte |= XE_PPGTT_PTE_PAT0;
1378 
1379 	if (pat_index & BIT(1))
1380 		pte |= XE_PPGTT_PTE_PAT1;
1381 
1382 	if (pat_index & BIT(2)) {
1383 		if (pt_level)
1384 			pte |= XE_PPGTT_PDE_PDPE_PAT2;
1385 		else
1386 			pte |= XE_PPGTT_PTE_PAT2;
1387 	}
1388 
1389 	if (pat_index & BIT(3))
1390 		pte |= XELPG_PPGTT_PTE_PAT3;
1391 
1392 	if (pat_index & (BIT(4)))
1393 		pte |= XE2_PPGTT_PTE_PAT4;
1394 
1395 	return pte;
1396 }
1397 
1398 static u64 pte_encode_ps(u32 pt_level)
1399 {
1400 	XE_WARN_ON(pt_level > MAX_HUGEPTE_LEVEL);
1401 
1402 	if (pt_level == 1)
1403 		return XE_PDE_PS_2M;
1404 	else if (pt_level == 2)
1405 		return XE_PDPE_PS_1G;
1406 
1407 	return 0;
1408 }
1409 
1410 static u16 pde_pat_index(struct xe_bo *bo)
1411 {
1412 	struct xe_device *xe = xe_bo_device(bo);
1413 	u16 pat_index;
1414 
1415 	/*
1416 	 * We only have two bits to encode the PAT index in non-leaf nodes, but
1417 	 * these only point to other paging structures so we only need a minimal
1418 	 * selection of options. The user PAT index is only for encoding leaf
1419 	 * nodes, where we have use of more bits to do the encoding. The
1420 	 * non-leaf nodes are instead under driver control so the chosen index
1421 	 * here should be distinct from the user PAT index. Also the
1422 	 * corresponding coherency of the PAT index should be tied to the
1423 	 * allocation type of the page table (or at least we should pick
1424 	 * something which is always safe).
1425 	 */
1426 	if (!xe_bo_is_vram(bo) && bo->ttm.ttm->caching == ttm_cached)
1427 		pat_index = xe_cache_pat_idx(xe, XE_CACHE_WB);
1428 	else
1429 		pat_index = xe_cache_pat_idx(xe, XE_CACHE_NONE);
1430 
1431 	xe_assert(xe, pat_index <= 3);
1432 
1433 	return pat_index;
1434 }
1435 
1436 static u64 xelp_pde_encode_bo(struct xe_bo *bo, u64 bo_offset)
1437 {
1438 	u64 pde;
1439 
1440 	pde = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
1441 	pde |= XE_PAGE_PRESENT | XE_PAGE_RW;
1442 	pde |= pde_encode_pat_index(pde_pat_index(bo));
1443 
1444 	return pde;
1445 }
1446 
1447 static u64 xelp_pte_encode_bo(struct xe_bo *bo, u64 bo_offset,
1448 			      u16 pat_index, u32 pt_level)
1449 {
1450 	u64 pte;
1451 
1452 	pte = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
1453 	pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
1454 	pte |= pte_encode_pat_index(pat_index, pt_level);
1455 	pte |= pte_encode_ps(pt_level);
1456 
1457 	if (xe_bo_is_vram(bo) || xe_bo_is_stolen_devmem(bo))
1458 		pte |= XE_PPGTT_PTE_DM;
1459 
1460 	return pte;
1461 }
1462 
1463 static u64 xelp_pte_encode_vma(u64 pte, struct xe_vma *vma,
1464 			       u16 pat_index, u32 pt_level)
1465 {
1466 	struct xe_bo *bo = xe_vma_bo(vma);
1467 	struct xe_vm *vm = xe_vma_vm(vma);
1468 
1469 	pte |= XE_PAGE_PRESENT;
1470 
1471 	if (likely(!xe_vma_read_only(vma)))
1472 		pte |= XE_PAGE_RW;
1473 
1474 	pte |= pte_encode_pat_index(pat_index, pt_level);
1475 	pte |= pte_encode_ps(pt_level);
1476 
1477 	/*
1478 	 * NULL PTEs redirect to scratch page (return zeros on read).
1479 	 * Set for: 1) explicit null VMAs, 2) purged BOs on scratch VMs.
1480 	 * Never set NULL flag without scratch page - causes undefined behavior.
1481 	 */
1482 	if (unlikely(xe_vma_is_null(vma) ||
1483 		     (bo && xe_bo_is_purged(bo) && xe_vm_has_scratch(vm))))
1484 		pte |= XE_PTE_NULL;
1485 
1486 	return pte;
1487 }
1488 
1489 static u64 xelp_pte_encode_addr(struct xe_device *xe, u64 addr,
1490 				u16 pat_index,
1491 				u32 pt_level, bool devmem, u64 flags)
1492 {
1493 	u64 pte;
1494 
1495 	/* Avoid passing random bits directly as flags */
1496 	xe_assert(xe, !(flags & ~XE_PTE_PS64));
1497 
1498 	pte = addr;
1499 	pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
1500 	pte |= pte_encode_pat_index(pat_index, pt_level);
1501 	pte |= pte_encode_ps(pt_level);
1502 
1503 	if (devmem)
1504 		pte |= XE_PPGTT_PTE_DM;
1505 
1506 	pte |= flags;
1507 
1508 	return pte;
1509 }
1510 
1511 static const struct xe_pt_ops xelp_pt_ops = {
1512 	.pte_encode_bo = xelp_pte_encode_bo,
1513 	.pte_encode_vma = xelp_pte_encode_vma,
1514 	.pte_encode_addr = xelp_pte_encode_addr,
1515 	.pde_encode_bo = xelp_pde_encode_bo,
1516 };
1517 
1518 static void vm_destroy_work_func(struct work_struct *w);
1519 
1520 /**
1521  * xe_vm_create_scratch() - Setup a scratch memory pagetable tree for the
1522  * given tile and vm.
1523  * @xe: xe device.
1524  * @tile: tile to set up for.
1525  * @vm: vm to set up for.
1526  * @exec: The struct drm_exec object used to lock the vm resv.
1527  *
1528  * Sets up a pagetable tree with one page-table per level and a single
1529  * leaf PTE. All pagetable entries point to the single page-table or,
1530  * for MAX_HUGEPTE_LEVEL, a NULL huge PTE returning 0 on read and
1531  * writes become NOPs.
1532  *
1533  * Return: 0 on success, negative error code on error.
1534  */
1535 static int xe_vm_create_scratch(struct xe_device *xe, struct xe_tile *tile,
1536 				struct xe_vm *vm, struct drm_exec *exec)
1537 {
1538 	u8 id = tile->id;
1539 	int i;
1540 
1541 	for (i = MAX_HUGEPTE_LEVEL; i < vm->pt_root[id]->level; i++) {
1542 		vm->scratch_pt[id][i] = xe_pt_create(vm, tile, i, exec);
1543 		if (IS_ERR(vm->scratch_pt[id][i])) {
1544 			int err = PTR_ERR(vm->scratch_pt[id][i]);
1545 
1546 			vm->scratch_pt[id][i] = NULL;
1547 			return err;
1548 		}
1549 		xe_pt_populate_empty(tile, vm, vm->scratch_pt[id][i]);
1550 	}
1551 
1552 	return 0;
1553 }
1554 ALLOW_ERROR_INJECTION(xe_vm_create_scratch, ERRNO);
1555 
1556 static void xe_vm_free_scratch(struct xe_vm *vm)
1557 {
1558 	struct xe_tile *tile;
1559 	u8 id;
1560 
1561 	if (!xe_vm_has_scratch(vm))
1562 		return;
1563 
1564 	for_each_tile(tile, vm->xe, id) {
1565 		u32 i;
1566 
1567 		if (!vm->pt_root[id])
1568 			continue;
1569 
1570 		for (i = MAX_HUGEPTE_LEVEL; i < vm->pt_root[id]->level; ++i)
1571 			if (vm->scratch_pt[id][i])
1572 				xe_pt_destroy(vm->scratch_pt[id][i], vm->flags, NULL);
1573 	}
1574 }
1575 
1576 static void xe_vm_pt_destroy(struct xe_vm *vm)
1577 {
1578 	struct xe_tile *tile;
1579 	u8 id;
1580 
1581 	xe_vm_assert_held(vm);
1582 
1583 	for_each_tile(tile, vm->xe, id) {
1584 		if (vm->pt_root[id]) {
1585 			xe_pt_destroy(vm->pt_root[id], vm->flags, NULL);
1586 			vm->pt_root[id] = NULL;
1587 		}
1588 	}
1589 }
1590 
1591 static void xe_vm_init_prove_locking(struct xe_device *xe, struct xe_vm *vm)
1592 {
1593 	if (!IS_ENABLED(CONFIG_PROVE_LOCKING))
1594 		return;
1595 
1596 	fs_reclaim_acquire(GFP_KERNEL);
1597 	might_lock(&vm->exec_queues.lock);
1598 	fs_reclaim_release(GFP_KERNEL);
1599 
1600 	down_read(&vm->exec_queues.lock);
1601 	might_lock(&xe_root_mmio_gt(xe)->uc.guc.ct.lock);
1602 	up_read(&vm->exec_queues.lock);
1603 }
1604 
1605 struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags, struct xe_file *xef)
1606 {
1607 	struct drm_gem_object *vm_resv_obj;
1608 	struct xe_validation_ctx ctx;
1609 	struct drm_exec exec;
1610 	struct xe_vm *vm;
1611 	int err;
1612 	struct xe_tile *tile;
1613 	u8 id;
1614 
1615 	/*
1616 	 * Since the GSCCS is not user-accessible, we don't expect a GSC VM to
1617 	 * ever be in faulting mode.
1618 	 */
1619 	xe_assert(xe, !((flags & XE_VM_FLAG_GSC) && (flags & XE_VM_FLAG_FAULT_MODE)));
1620 
1621 	vm = kzalloc(sizeof(*vm), GFP_KERNEL);
1622 	if (!vm)
1623 		return ERR_PTR(-ENOMEM);
1624 
1625 	vm->xe = xe;
1626 
1627 	vm->size = 1ull << xe->info.va_bits;
1628 	vm->flags = flags;
1629 
1630 	if (xef)
1631 		vm->xef = xe_file_get(xef);
1632 	/**
1633 	 * GSC VMs are kernel-owned, only used for PXP ops and can sometimes be
1634 	 * manipulated under the PXP mutex. However, the PXP mutex can be taken
1635 	 * under a user-VM lock when the PXP session is started at exec_queue
1636 	 * creation time. Those are different VMs and therefore there is no risk
1637 	 * of deadlock, but we need to tell lockdep that this is the case or it
1638 	 * will print a warning.
1639 	 */
1640 	if (flags & XE_VM_FLAG_GSC) {
1641 		static struct lock_class_key gsc_vm_key;
1642 
1643 		__init_rwsem(&vm->lock, "gsc_vm", &gsc_vm_key);
1644 	} else {
1645 		init_rwsem(&vm->lock);
1646 	}
1647 	mutex_init(&vm->snap_mutex);
1648 
1649 	INIT_LIST_HEAD(&vm->rebind_list);
1650 
1651 	INIT_LIST_HEAD(&vm->userptr.repin_list);
1652 	INIT_LIST_HEAD(&vm->userptr.invalidated);
1653 	spin_lock_init(&vm->userptr.invalidated_lock);
1654 
1655 	INIT_LIST_HEAD(&vm->faults.list);
1656 	spin_lock_init(&vm->faults.lock);
1657 
1658 	ttm_lru_bulk_move_init(&vm->lru_bulk_move);
1659 
1660 	INIT_WORK(&vm->destroy_work, vm_destroy_work_func);
1661 
1662 	INIT_LIST_HEAD(&vm->preempt.exec_queues);
1663 	for (id = 0; id < XE_MAX_TILES_PER_DEVICE * XE_MAX_GT_PER_TILE; ++id)
1664 		INIT_LIST_HEAD(&vm->exec_queues.list[id]);
1665 	if (flags & XE_VM_FLAG_FAULT_MODE)
1666 		vm->preempt.min_run_period_ms = xe->min_run_period_pf_ms;
1667 	else
1668 		vm->preempt.min_run_period_ms = xe->min_run_period_lr_ms;
1669 
1670 	init_rwsem(&vm->exec_queues.lock);
1671 	xe_vm_init_prove_locking(xe, vm);
1672 
1673 	for_each_tile(tile, xe, id)
1674 		xe_range_fence_tree_init(&vm->rftree[id]);
1675 
1676 	vm->pt_ops = &xelp_pt_ops;
1677 
1678 	/*
1679 	 * Long-running workloads are not protected by the scheduler references.
1680 	 * By design, run_job for long-running workloads returns NULL and the
1681 	 * scheduler drops all the references of it, hence protecting the VM
1682 	 * for this case is necessary.
1683 	 */
1684 	if (flags & XE_VM_FLAG_LR_MODE) {
1685 		INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func);
1686 		xe_pm_runtime_get_noresume(xe);
1687 		INIT_LIST_HEAD(&vm->preempt.pm_activate_link);
1688 	}
1689 
1690 	err = xe_svm_init(vm);
1691 	if (err)
1692 		goto err_no_resv;
1693 
1694 	vm_resv_obj = drm_gpuvm_resv_object_alloc(&xe->drm);
1695 	if (!vm_resv_obj) {
1696 		err = -ENOMEM;
1697 		goto err_svm_fini;
1698 	}
1699 
1700 	drm_gpuvm_init(&vm->gpuvm, "Xe VM", DRM_GPUVM_RESV_PROTECTED, &xe->drm,
1701 		       vm_resv_obj, 0, vm->size, 0, 0, &gpuvm_ops);
1702 
1703 	drm_gem_object_put(vm_resv_obj);
1704 
1705 	err = 0;
1706 	xe_validation_guard(&ctx, &xe->val, &exec, (struct xe_val_flags) {.interruptible = true},
1707 			    err) {
1708 		err = xe_vm_drm_exec_lock(vm, &exec);
1709 		drm_exec_retry_on_contention(&exec);
1710 
1711 		if (IS_DGFX(xe) && xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)
1712 			vm->flags |= XE_VM_FLAG_64K;
1713 
1714 		for_each_tile(tile, xe, id) {
1715 			if (flags & XE_VM_FLAG_MIGRATION &&
1716 			    tile->id != XE_VM_FLAG_TILE_ID(flags))
1717 				continue;
1718 
1719 			vm->pt_root[id] = xe_pt_create(vm, tile, xe->info.vm_max_level,
1720 						       &exec);
1721 			if (IS_ERR(vm->pt_root[id])) {
1722 				err = PTR_ERR(vm->pt_root[id]);
1723 				vm->pt_root[id] = NULL;
1724 				xe_vm_pt_destroy(vm);
1725 				drm_exec_retry_on_contention(&exec);
1726 				xe_validation_retry_on_oom(&ctx, &err);
1727 				break;
1728 			}
1729 		}
1730 		if (err)
1731 			break;
1732 
1733 		if (xe_vm_has_scratch(vm)) {
1734 			for_each_tile(tile, xe, id) {
1735 				if (!vm->pt_root[id])
1736 					continue;
1737 
1738 				err = xe_vm_create_scratch(xe, tile, vm, &exec);
1739 				if (err) {
1740 					xe_vm_free_scratch(vm);
1741 					xe_vm_pt_destroy(vm);
1742 					drm_exec_retry_on_contention(&exec);
1743 					xe_validation_retry_on_oom(&ctx, &err);
1744 					break;
1745 				}
1746 			}
1747 			if (err)
1748 				break;
1749 			vm->batch_invalidate_tlb = true;
1750 		}
1751 
1752 		if (vm->flags & XE_VM_FLAG_LR_MODE) {
1753 			INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func);
1754 			vm->batch_invalidate_tlb = false;
1755 		}
1756 
1757 		/* Fill pt_root after allocating scratch tables */
1758 		for_each_tile(tile, xe, id) {
1759 			if (!vm->pt_root[id])
1760 				continue;
1761 
1762 			xe_pt_populate_empty(tile, vm, vm->pt_root[id]);
1763 		}
1764 	}
1765 	if (err)
1766 		goto err_close;
1767 
1768 	/* Kernel migration VM shouldn't have a circular loop.. */
1769 	if (!(flags & XE_VM_FLAG_MIGRATION)) {
1770 		for_each_tile(tile, xe, id) {
1771 			struct xe_exec_queue *q;
1772 			u32 create_flags = EXEC_QUEUE_FLAG_VM;
1773 
1774 			if (!vm->pt_root[id])
1775 				continue;
1776 
1777 			if (!xef) /* Not from userspace */
1778 				create_flags |= EXEC_QUEUE_FLAG_KERNEL;
1779 
1780 			q = xe_exec_queue_create_bind(xe, tile, vm, create_flags, 0);
1781 			if (IS_ERR(q)) {
1782 				err = PTR_ERR(q);
1783 				goto err_close;
1784 			}
1785 			vm->q[id] = q;
1786 		}
1787 	}
1788 
1789 	if (xef && xe->info.has_asid) {
1790 		u32 asid;
1791 
1792 		down_write(&xe->usm.lock);
1793 		err = xa_alloc_cyclic(&xe->usm.asid_to_vm, &asid, vm,
1794 				      XA_LIMIT(1, XE_MAX_ASID - 1),
1795 				      &xe->usm.next_asid, GFP_NOWAIT);
1796 		up_write(&xe->usm.lock);
1797 		if (err < 0)
1798 			goto err_close;
1799 
1800 		vm->usm.asid = asid;
1801 	}
1802 
1803 	trace_xe_vm_create(vm);
1804 
1805 	return vm;
1806 
1807 err_close:
1808 	xe_vm_close_and_put(vm);
1809 	return ERR_PTR(err);
1810 
1811 err_svm_fini:
1812 	if (flags & XE_VM_FLAG_FAULT_MODE) {
1813 		vm->size = 0; /* close the vm */
1814 		xe_svm_fini(vm);
1815 	}
1816 err_no_resv:
1817 	mutex_destroy(&vm->snap_mutex);
1818 	for_each_tile(tile, xe, id)
1819 		xe_range_fence_tree_fini(&vm->rftree[id]);
1820 	ttm_lru_bulk_move_fini(&xe->ttm, &vm->lru_bulk_move);
1821 	if (vm->xef)
1822 		xe_file_put(vm->xef);
1823 	kfree(vm);
1824 	if (flags & XE_VM_FLAG_LR_MODE)
1825 		xe_pm_runtime_put(xe);
1826 	return ERR_PTR(err);
1827 }
1828 
1829 static void xe_vm_close(struct xe_vm *vm)
1830 {
1831 	struct xe_device *xe = vm->xe;
1832 	bool bound;
1833 	int idx;
1834 
1835 	bound = drm_dev_enter(&xe->drm, &idx);
1836 
1837 	down_write(&vm->lock);
1838 	if (xe_vm_in_fault_mode(vm))
1839 		xe_svm_notifier_lock(vm);
1840 
1841 	vm->size = 0;
1842 
1843 	if (!((vm->flags & XE_VM_FLAG_MIGRATION))) {
1844 		struct xe_tile *tile;
1845 		struct xe_gt *gt;
1846 		u8 id;
1847 
1848 		/* Wait for pending binds */
1849 		dma_resv_wait_timeout(xe_vm_resv(vm),
1850 				      DMA_RESV_USAGE_BOOKKEEP,
1851 				      false, MAX_SCHEDULE_TIMEOUT);
1852 
1853 		if (bound) {
1854 			for_each_tile(tile, xe, id)
1855 				if (vm->pt_root[id])
1856 					xe_pt_clear(xe, vm->pt_root[id]);
1857 
1858 			for_each_gt(gt, xe, id)
1859 				xe_tlb_inval_vm(&gt->tlb_inval, vm);
1860 		}
1861 	}
1862 
1863 	if (xe_vm_in_fault_mode(vm))
1864 		xe_svm_notifier_unlock(vm);
1865 	up_write(&vm->lock);
1866 
1867 	if (bound)
1868 		drm_dev_exit(idx);
1869 }
1870 
/**
 * xe_vm_close_and_put() - Close a VM, tear down its contents and drop a reference
 * @vm: The VM to close.
 *
 * Closes the VM via xe_vm_close(), kills and releases the per-tile exec
 * queues stored in vm->q[], removes and destroys every VMA, frees the scratch
 * and root page tables, erases the VM's ASID (if one was assigned) and
 * finally calls xe_vm_put().
 *
 * Must not be called while vm->preempt.num_exec_queues is non-zero (asserted).
 */
void xe_vm_close_and_put(struct xe_vm *vm)
{
	/* VMAs that cannot be destroyed under the vm resv alone, see below. */
	LIST_HEAD(contested);
	struct xe_device *xe = vm->xe;
	struct xe_tile *tile;
	struct xe_vma *vma, *next_vma;
	struct drm_gpuva *gpuva, *next;
	u8 id;

	xe_assert(xe, !vm->preempt.num_exec_queues);

	xe_vm_close(vm);
	if (xe_vm_in_preempt_fence_mode(vm)) {
		/* Unhook from the PM activate list, then flush the rebind worker. */
		mutex_lock(&xe->rebind_resume_lock);
		list_del_init(&vm->preempt.pm_activate_link);
		mutex_unlock(&xe->rebind_resume_lock);
		flush_work(&vm->preempt.rebind_work);
	}
	if (xe_vm_in_fault_mode(vm))
		xe_svm_close(vm);

	/* Drop the last-fence references held on behalf of the VM's queues. */
	down_write(&vm->lock);
	for_each_tile(tile, xe, id) {
		if (vm->q[id]) {
			int i;

			xe_exec_queue_last_fence_put(vm->q[id], vm);
			for_each_tlb_inval(i)
				xe_exec_queue_tlb_inval_last_fence_put(vm->q[id], vm, i);
		}
	}
	up_write(&vm->lock);

	/* Kill and release the queues with vm->lock dropped. */
	for_each_tile(tile, xe, id) {
		if (vm->q[id]) {
			xe_exec_queue_kill(vm->q[id]);
			xe_exec_queue_put(vm->q[id]);
			vm->q[id] = NULL;
		}
	}

	down_write(&vm->lock);
	xe_vm_lock(vm, false);
	drm_gpuvm_for_each_va_safe(gpuva, next, &vm->gpuvm) {
		vma = gpuva_to_vma(gpuva);

		/* Flag BO-less VMAs as destroyed under the SVM notifier lock. */
		if (xe_vma_has_no_bo(vma)) {
			xe_svm_notifier_lock(vm);
			vma->gpuva.flags |= XE_VMA_DESTROYED;
			xe_svm_notifier_unlock(vm);
		}

		xe_vm_remove_vma(vm, vma);

		/* easy case, remove from VMA? */
		if (xe_vma_has_no_bo(vma) || xe_vma_bo(vma)->vm) {
			list_del_init(&vma->combined_links.rebind);
			xe_vma_destroy(vma, NULL);
			continue;
		}

		/* BO without bo->vm set: destroy later via xe_vma_destroy_unlocked(). */
		list_move_tail(&vma->combined_links.destroy, &contested);
		vma->gpuva.flags |= XE_VMA_DESTROYED;
	}

	/*
	 * All vm operations will add shared fences to resv.
	 * The only exception is eviction for a shared object,
	 * but even so, the unbind when evicted would still
	 * install a fence to resv. Hence it's safe to
	 * destroy the pagetables immediately.
	 */
	xe_vm_free_scratch(vm);
	xe_vm_pt_destroy(vm);
	xe_vm_unlock(vm);

	/*
	 * VM is now dead, cannot re-add nodes to vm->vmas if it's NULL
	 * Since we hold a refcount to the bo, we can remove and free
	 * the members safely without locking.
	 */
	list_for_each_entry_safe(vma, next_vma, &contested,
				 combined_links.destroy) {
		list_del_init(&vma->combined_links.destroy);
		xe_vma_destroy_unlocked(vma);
	}

	xe_svm_fini(vm);

	up_write(&vm->lock);

	/* Release the ASID, if one was assigned at creation. */
	down_write(&xe->usm.lock);
	if (vm->usm.asid) {
		void *lookup;

		xe_assert(xe, xe->info.has_asid);
		xe_assert(xe, !(vm->flags & XE_VM_FLAG_MIGRATION));

		lookup = xa_erase(&xe->usm.asid_to_vm, vm->usm.asid);
		xe_assert(xe, lookup == vm);
	}
	up_write(&xe->usm.lock);

	xe_vm_clear_fault_entries(vm);

	for_each_tile(tile, xe, id)
		xe_range_fence_tree_fini(&vm->rftree[id]);

	xe_vm_put(vm);
}
1981 
1982 static void vm_destroy_work_func(struct work_struct *w)
1983 {
1984 	struct xe_vm *vm =
1985 		container_of(w, struct xe_vm, destroy_work);
1986 	struct xe_device *xe = vm->xe;
1987 	struct xe_tile *tile;
1988 	u8 id;
1989 
1990 	/* xe_vm_close_and_put was not called? */
1991 	xe_assert(xe, !vm->size);
1992 
1993 	if (xe_vm_in_preempt_fence_mode(vm))
1994 		flush_work(&vm->preempt.rebind_work);
1995 
1996 	mutex_destroy(&vm->snap_mutex);
1997 
1998 	if (vm->flags & XE_VM_FLAG_LR_MODE)
1999 		xe_pm_runtime_put(xe);
2000 
2001 	for_each_tile(tile, xe, id)
2002 		XE_WARN_ON(vm->pt_root[id]);
2003 
2004 	trace_xe_vm_free(vm);
2005 
2006 	ttm_lru_bulk_move_fini(&xe->ttm, &vm->lru_bulk_move);
2007 
2008 	if (vm->xef)
2009 		xe_file_put(vm->xef);
2010 
2011 	kfree(vm);
2012 }
2013 
2014 static void xe_vm_free(struct drm_gpuvm *gpuvm)
2015 {
2016 	struct xe_vm *vm = container_of(gpuvm, struct xe_vm, gpuvm);
2017 
2018 	/* To destroy the VM we need to be able to sleep */
2019 	queue_work(system_dfl_wq, &vm->destroy_work);
2020 }
2021 
2022 struct xe_vm *xe_vm_lookup(struct xe_file *xef, u32 id)
2023 {
2024 	struct xe_vm *vm;
2025 
2026 	mutex_lock(&xef->vm.lock);
2027 	vm = xa_load(&xef->vm.xa, id);
2028 	if (vm)
2029 		xe_vm_get(vm);
2030 	mutex_unlock(&xef->vm.lock);
2031 
2032 	return vm;
2033 }
2034 
2035 u64 xe_vm_pdp4_descriptor(struct xe_vm *vm, struct xe_tile *tile)
2036 {
2037 	return vm->pt_ops->pde_encode_bo(vm->pt_root[tile->id]->bo, 0);
2038 }
2039 
2040 static struct xe_exec_queue *
2041 to_wait_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
2042 {
2043 	return q ? q : vm->q[0];
2044 }
2045 
2046 static struct xe_user_fence *
2047 find_ufence_get(struct xe_sync_entry *syncs, u32 num_syncs)
2048 {
2049 	unsigned int i;
2050 
2051 	for (i = 0; i < num_syncs; i++) {
2052 		struct xe_sync_entry *e = &syncs[i];
2053 
2054 		if (xe_sync_is_ufence(e))
2055 			return xe_sync_ufence_get(e);
2056 	}
2057 
2058 	return NULL;
2059 }
2060 
2061 #define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE | \
2062 				    DRM_XE_VM_CREATE_FLAG_LR_MODE | \
2063 				    DRM_XE_VM_CREATE_FLAG_FAULT_MODE | \
2064 				    DRM_XE_VM_CREATE_FLAG_NO_VM_OVERCOMMIT)
2065 
2066 int xe_vm_create_ioctl(struct drm_device *dev, void *data,
2067 		       struct drm_file *file)
2068 {
2069 	struct xe_device *xe = to_xe_device(dev);
2070 	struct xe_file *xef = to_xe_file(file);
2071 	struct drm_xe_vm_create *args = data;
2072 	struct xe_gt *wa_gt = xe_root_mmio_gt(xe);
2073 	struct xe_vm *vm;
2074 	u32 id;
2075 	int err;
2076 	u32 flags = 0;
2077 
2078 	if (XE_IOCTL_DBG(xe, args->extensions))
2079 		return -EINVAL;
2080 
2081 	if (wa_gt && XE_GT_WA(wa_gt, 22014953428))
2082 		args->flags |= DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE;
2083 
2084 	if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE &&
2085 			 !xe->info.has_usm))
2086 		return -EINVAL;
2087 
2088 	if (XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
2089 		return -EINVAL;
2090 
2091 	if (XE_IOCTL_DBG(xe, args->flags & ~ALL_DRM_XE_VM_CREATE_FLAGS))
2092 		return -EINVAL;
2093 
2094 	if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE &&
2095 			 args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE &&
2096 			 !xe->info.needs_scratch))
2097 		return -EINVAL;
2098 
2099 	if (XE_IOCTL_DBG(xe, !(args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE) &&
2100 			 args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE))
2101 		return -EINVAL;
2102 
2103 	if (XE_IOCTL_DBG(xe, !(args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE) &&
2104 			 args->flags & DRM_XE_VM_CREATE_FLAG_NO_VM_OVERCOMMIT))
2105 		return -EINVAL;
2106 
2107 	if (args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE)
2108 		flags |= XE_VM_FLAG_SCRATCH_PAGE;
2109 	if (args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE)
2110 		flags |= XE_VM_FLAG_LR_MODE;
2111 	if (args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
2112 		flags |= XE_VM_FLAG_FAULT_MODE;
2113 	if (args->flags & DRM_XE_VM_CREATE_FLAG_NO_VM_OVERCOMMIT)
2114 		flags |= XE_VM_FLAG_NO_VM_OVERCOMMIT;
2115 
2116 	vm = xe_vm_create(xe, flags, xef);
2117 	if (IS_ERR(vm))
2118 		return PTR_ERR(vm);
2119 
2120 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_MEM)
2121 	/* Warning: Security issue - never enable by default */
2122 	args->reserved[0] = xe_bo_main_addr(vm->pt_root[0]->bo, XE_PAGE_SIZE);
2123 #endif
2124 
2125 	/* user id alloc must always be last in ioctl to prevent UAF */
2126 	err = xa_alloc(&xef->vm.xa, &id, vm, xa_limit_32b, GFP_KERNEL);
2127 	if (err)
2128 		goto err_close_and_put;
2129 
2130 	args->vm_id = id;
2131 
2132 	return 0;
2133 
2134 err_close_and_put:
2135 	xe_vm_close_and_put(vm);
2136 
2137 	return err;
2138 }
2139 
2140 int xe_vm_destroy_ioctl(struct drm_device *dev, void *data,
2141 			struct drm_file *file)
2142 {
2143 	struct xe_device *xe = to_xe_device(dev);
2144 	struct xe_file *xef = to_xe_file(file);
2145 	struct drm_xe_vm_destroy *args = data;
2146 	struct xe_vm *vm;
2147 	int err = 0;
2148 
2149 	if (XE_IOCTL_DBG(xe, args->pad) ||
2150 	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
2151 		return -EINVAL;
2152 
2153 	mutex_lock(&xef->vm.lock);
2154 	vm = xa_load(&xef->vm.xa, args->vm_id);
2155 	if (XE_IOCTL_DBG(xe, !vm))
2156 		err = -ENOENT;
2157 	else if (XE_IOCTL_DBG(xe, vm->preempt.num_exec_queues))
2158 		err = -EBUSY;
2159 	else
2160 		xa_erase(&xef->vm.xa, args->vm_id);
2161 	mutex_unlock(&xef->vm.lock);
2162 
2163 	if (!err)
2164 		xe_vm_close_and_put(vm);
2165 
2166 	return err;
2167 }
2168 
2169 static int xe_vm_query_vmas(struct xe_vm *vm, u64 start, u64 end)
2170 {
2171 	struct drm_gpuva *gpuva;
2172 	u32 num_vmas = 0;
2173 
2174 	lockdep_assert_held(&vm->lock);
2175 	drm_gpuvm_for_each_va_range(gpuva, &vm->gpuvm, start, end)
2176 		num_vmas++;
2177 
2178 	return num_vmas;
2179 }
2180 
2181 static int get_mem_attrs(struct xe_vm *vm, u32 *num_vmas, u64 start,
2182 			 u64 end, struct drm_xe_mem_range_attr *attrs)
2183 {
2184 	struct drm_gpuva *gpuva;
2185 	int i = 0;
2186 
2187 	lockdep_assert_held(&vm->lock);
2188 
2189 	drm_gpuvm_for_each_va_range(gpuva, &vm->gpuvm, start, end) {
2190 		struct xe_vma *vma = gpuva_to_vma(gpuva);
2191 
2192 		if (i == *num_vmas)
2193 			return -ENOSPC;
2194 
2195 		attrs[i].start = xe_vma_start(vma);
2196 		attrs[i].end = xe_vma_end(vma);
2197 		attrs[i].atomic.val = vma->attr.atomic_access;
2198 		attrs[i].pat_index.val = vma->attr.pat_index;
2199 		attrs[i].preferred_mem_loc.devmem_fd = vma->attr.preferred_loc.devmem_fd;
2200 		attrs[i].preferred_mem_loc.migration_policy =
2201 		vma->attr.preferred_loc.migration_policy;
2202 
2203 		i++;
2204 	}
2205 
2206 	*num_vmas = i;
2207 	return 0;
2208 }
2209 
2210 int xe_vm_query_vmas_attrs_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
2211 {
2212 	struct xe_device *xe = to_xe_device(dev);
2213 	struct xe_file *xef = to_xe_file(file);
2214 	struct drm_xe_mem_range_attr *mem_attrs;
2215 	struct drm_xe_vm_query_mem_range_attr *args = data;
2216 	u64 __user *attrs_user = u64_to_user_ptr(args->vector_of_mem_attr);
2217 	struct xe_vm *vm;
2218 	int err = 0;
2219 
2220 	if (XE_IOCTL_DBG(xe,
2221 			 ((args->num_mem_ranges == 0 &&
2222 			  (attrs_user || args->sizeof_mem_range_attr != 0)) ||
2223 			 (args->num_mem_ranges > 0 &&
2224 			  (!attrs_user ||
2225 			   args->sizeof_mem_range_attr !=
2226 			   sizeof(struct drm_xe_mem_range_attr))))))
2227 		return -EINVAL;
2228 
2229 	vm = xe_vm_lookup(xef, args->vm_id);
2230 	if (XE_IOCTL_DBG(xe, !vm))
2231 		return -EINVAL;
2232 
2233 	err = down_read_interruptible(&vm->lock);
2234 	if (err)
2235 		goto put_vm;
2236 
2237 	attrs_user = u64_to_user_ptr(args->vector_of_mem_attr);
2238 
2239 	if (args->num_mem_ranges == 0 && !attrs_user) {
2240 		args->num_mem_ranges = xe_vm_query_vmas(vm, args->start, args->start + args->range);
2241 		args->sizeof_mem_range_attr = sizeof(struct drm_xe_mem_range_attr);
2242 		goto unlock_vm;
2243 	}
2244 
2245 	mem_attrs = kvmalloc_array(args->num_mem_ranges, args->sizeof_mem_range_attr,
2246 				   GFP_KERNEL | __GFP_ACCOUNT |
2247 				   __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
2248 	if (!mem_attrs) {
2249 		err = args->num_mem_ranges > 1 ? -ENOBUFS : -ENOMEM;
2250 		goto unlock_vm;
2251 	}
2252 
2253 	memset(mem_attrs, 0, args->num_mem_ranges * args->sizeof_mem_range_attr);
2254 	err = get_mem_attrs(vm, &args->num_mem_ranges, args->start,
2255 			    args->start + args->range, mem_attrs);
2256 	if (err)
2257 		goto free_mem_attrs;
2258 
2259 	err = copy_to_user(attrs_user, mem_attrs,
2260 			   args->sizeof_mem_range_attr * args->num_mem_ranges);
2261 	if (err)
2262 		err = -EFAULT;
2263 
2264 free_mem_attrs:
2265 	kvfree(mem_attrs);
2266 unlock_vm:
2267 	up_read(&vm->lock);
2268 put_vm:
2269 	xe_vm_put(vm);
2270 	return err;
2271 }
2272 
2273 static bool vma_matches(struct xe_vma *vma, u64 page_addr)
2274 {
2275 	if (page_addr > xe_vma_end(vma) - 1 ||
2276 	    page_addr + SZ_4K - 1 < xe_vma_start(vma))
2277 		return false;
2278 
2279 	return true;
2280 }
2281 
2282 /**
2283  * xe_vm_find_vma_by_addr() - Find a VMA by its address
2284  *
2285  * @vm: the xe_vm the vma belongs to
2286  * @page_addr: address to look up
2287  */
2288 struct xe_vma *xe_vm_find_vma_by_addr(struct xe_vm *vm, u64 page_addr)
2289 {
2290 	struct xe_vma *vma = NULL;
2291 
2292 	if (vm->usm.last_fault_vma) {   /* Fast lookup */
2293 		if (vma_matches(vm->usm.last_fault_vma, page_addr))
2294 			vma = vm->usm.last_fault_vma;
2295 	}
2296 	if (!vma)
2297 		vma = xe_vm_find_overlapping_vma(vm, page_addr, SZ_4K);
2298 
2299 	return vma;
2300 }
2301 
/*
 * Lookup table indexed by the prefetch region number; each entry is the
 * XE_PL_* value used for that region (entry 0 is XE_PL_TT, the following
 * entries are the VRAM placements).
 */
static const u32 region_to_mem_type[] = {
	XE_PL_TT,
	XE_PL_VRAM0,
	XE_PL_VRAM1,
};
2307 
2308 static void prep_vma_destroy(struct xe_vm *vm, struct xe_vma *vma,
2309 			     bool post_commit)
2310 {
2311 	xe_svm_notifier_lock(vm);
2312 	vma->gpuva.flags |= XE_VMA_DESTROYED;
2313 	xe_svm_notifier_unlock(vm);
2314 	if (post_commit)
2315 		xe_vm_remove_vma(vm, vma);
2316 }
2317 
/*
 * Shorthand cast type used when passing address/range values to the "%llx"
 * format specifiers of the debug prints in this file.
 */
#undef ULL
#define ULL	unsigned long long
2320 
2321 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
2322 static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
2323 {
2324 	struct xe_vma *vma;
2325 
2326 	switch (op->op) {
2327 	case DRM_GPUVA_OP_MAP:
2328 		vm_dbg(&xe->drm, "MAP: addr=0x%016llx, range=0x%016llx",
2329 		       (ULL)op->map.va.addr, (ULL)op->map.va.range);
2330 		break;
2331 	case DRM_GPUVA_OP_REMAP:
2332 		vma = gpuva_to_vma(op->remap.unmap->va);
2333 		vm_dbg(&xe->drm, "REMAP:UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
2334 		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
2335 		       op->remap.unmap->keep ? 1 : 0);
2336 		if (op->remap.prev)
2337 			vm_dbg(&xe->drm,
2338 			       "REMAP:PREV: addr=0x%016llx, range=0x%016llx",
2339 			       (ULL)op->remap.prev->va.addr,
2340 			       (ULL)op->remap.prev->va.range);
2341 		if (op->remap.next)
2342 			vm_dbg(&xe->drm,
2343 			       "REMAP:NEXT: addr=0x%016llx, range=0x%016llx",
2344 			       (ULL)op->remap.next->va.addr,
2345 			       (ULL)op->remap.next->va.range);
2346 		break;
2347 	case DRM_GPUVA_OP_UNMAP:
2348 		vma = gpuva_to_vma(op->unmap.va);
2349 		vm_dbg(&xe->drm, "UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
2350 		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
2351 		       op->unmap.keep ? 1 : 0);
2352 		break;
2353 	case DRM_GPUVA_OP_PREFETCH:
2354 		vma = gpuva_to_vma(op->prefetch.va);
2355 		vm_dbg(&xe->drm, "PREFETCH: addr=0x%016llx, range=0x%016llx",
2356 		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma));
2357 		break;
2358 	default:
2359 		drm_warn(&xe->drm, "NOT POSSIBLE\n");
2360 	}
2361 }
2362 #else
2363 static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
2364 {
2365 }
2366 #endif
2367 
2368 static bool __xe_vm_needs_clear_scratch_pages(struct xe_vm *vm, u32 bind_flags)
2369 {
2370 	if (!xe_vm_in_fault_mode(vm))
2371 		return false;
2372 
2373 	if (!xe_vm_has_scratch(vm))
2374 		return false;
2375 
2376 	if (bind_flags & DRM_XE_VM_BIND_FLAG_IMMEDIATE)
2377 		return false;
2378 
2379 	return true;
2380 }
2381 
2382 static void xe_svm_prefetch_gpuva_ops_fini(struct drm_gpuva_ops *ops)
2383 {
2384 	struct drm_gpuva_op *__op;
2385 
2386 	drm_gpuva_for_each_op(__op, ops) {
2387 		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2388 
2389 		xe_vma_svm_prefetch_op_fini(op);
2390 	}
2391 }
2392 
2393 /*
2394  * Create operations list from IOCTL arguments, setup operations fields so parse
2395  * and commit steps are decoupled from IOCTL arguments. This step can fail.
2396  */
2397 static struct drm_gpuva_ops *
2398 vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_vma_ops *vops,
2399 			 struct xe_bo *bo, u64 bo_offset_or_userptr,
2400 			 u64 addr, u64 range,
2401 			 u32 operation, u32 flags,
2402 			 u32 prefetch_region, u16 pat_index)
2403 {
2404 	struct drm_gem_object *obj = bo ? &bo->ttm.base : NULL;
2405 	struct drm_gpuva_ops *ops;
2406 	struct drm_gpuva_op *__op;
2407 	struct drm_gpuvm_bo *vm_bo;
2408 	u64 range_start = addr;
2409 	u64 range_end = addr + range;
2410 	int err;
2411 
2412 	lockdep_assert_held_write(&vm->lock);
2413 
2414 	vm_dbg(&vm->xe->drm,
2415 	       "op=%d, addr=0x%016llx, range=0x%016llx, bo_offset_or_userptr=0x%016llx",
2416 	       operation, (ULL)addr, (ULL)range,
2417 	       (ULL)bo_offset_or_userptr);
2418 
2419 	switch (operation) {
2420 	case DRM_XE_VM_BIND_OP_MAP:
2421 		if (flags & DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR) {
2422 			xe_vm_find_cpu_addr_mirror_vma_range(vm, &range_start, &range_end);
2423 			vops->flags |= XE_VMA_OPS_FLAG_ALLOW_SVM_UNMAP;
2424 		}
2425 
2426 		fallthrough;
2427 	case DRM_XE_VM_BIND_OP_MAP_USERPTR: {
2428 		struct drm_gpuvm_map_req map_req = {
2429 			.map.va.addr = range_start,
2430 			.map.va.range = range_end - range_start,
2431 			.map.gem.obj = obj,
2432 			.map.gem.offset = bo_offset_or_userptr,
2433 		};
2434 
2435 		ops = drm_gpuvm_sm_map_ops_create(&vm->gpuvm, &map_req);
2436 		break;
2437 	}
2438 	case DRM_XE_VM_BIND_OP_UNMAP:
2439 		ops = drm_gpuvm_sm_unmap_ops_create(&vm->gpuvm, addr, range);
2440 		break;
2441 	case DRM_XE_VM_BIND_OP_PREFETCH:
2442 		ops = drm_gpuvm_prefetch_ops_create(&vm->gpuvm, addr, range);
2443 		break;
2444 	case DRM_XE_VM_BIND_OP_UNMAP_ALL:
2445 		xe_assert(vm->xe, bo);
2446 
2447 		err = xe_bo_lock(bo, true);
2448 		if (err)
2449 			return ERR_PTR(err);
2450 
2451 		vm_bo = drm_gpuvm_bo_obtain_locked(&vm->gpuvm, obj);
2452 		if (IS_ERR(vm_bo)) {
2453 			xe_bo_unlock(bo);
2454 			return ERR_CAST(vm_bo);
2455 		}
2456 
2457 		ops = drm_gpuvm_bo_unmap_ops_create(vm_bo);
2458 		drm_gpuvm_bo_put(vm_bo);
2459 		xe_bo_unlock(bo);
2460 		break;
2461 	default:
2462 		drm_warn(&vm->xe->drm, "NOT POSSIBLE\n");
2463 		ops = ERR_PTR(-EINVAL);
2464 	}
2465 	if (IS_ERR(ops))
2466 		return ops;
2467 
2468 	drm_gpuva_for_each_op(__op, ops) {
2469 		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2470 
2471 		if (__op->op == DRM_GPUVA_OP_MAP) {
2472 			op->map.immediate =
2473 				flags & DRM_XE_VM_BIND_FLAG_IMMEDIATE;
2474 			if (flags & DRM_XE_VM_BIND_FLAG_READONLY)
2475 				op->map.vma_flags |= XE_VMA_READ_ONLY;
2476 			if (flags & DRM_XE_VM_BIND_FLAG_NULL)
2477 				op->map.vma_flags |= DRM_GPUVA_SPARSE;
2478 			if (flags & DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR)
2479 				op->map.vma_flags |= XE_VMA_SYSTEM_ALLOCATOR;
2480 			if (flags & DRM_XE_VM_BIND_FLAG_DUMPABLE)
2481 				op->map.vma_flags |= XE_VMA_DUMPABLE;
2482 			if (flags & DRM_XE_VM_BIND_FLAG_MADVISE_AUTORESET)
2483 				op->map.vma_flags |= XE_VMA_MADV_AUTORESET;
2484 			op->map.request_decompress = flags & DRM_XE_VM_BIND_FLAG_DECOMPRESS;
2485 			op->map.pat_index = pat_index;
2486 			op->map.invalidate_on_bind =
2487 				__xe_vm_needs_clear_scratch_pages(vm, flags);
2488 		} else if (__op->op == DRM_GPUVA_OP_PREFETCH) {
2489 			struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
2490 			struct xe_tile *tile;
2491 			struct xe_svm_range *svm_range;
2492 			struct drm_gpusvm_ctx ctx = {};
2493 			struct drm_pagemap *dpagemap = NULL;
2494 			u8 id, tile_mask = 0;
2495 			u32 i;
2496 
2497 			if (!xe_vma_is_cpu_addr_mirror(vma)) {
2498 				op->prefetch.region = prefetch_region;
2499 				break;
2500 			}
2501 
2502 			ctx.read_only = xe_vma_read_only(vma);
2503 			ctx.devmem_possible = IS_DGFX(vm->xe) &&
2504 					      IS_ENABLED(CONFIG_DRM_XE_PAGEMAP);
2505 
2506 			for_each_tile(tile, vm->xe, id)
2507 				tile_mask |= 0x1 << id;
2508 
2509 			xa_init_flags(&op->prefetch_range.range, XA_FLAGS_ALLOC);
2510 			op->prefetch_range.ranges_count = 0;
2511 
2512 			if (prefetch_region == DRM_XE_CONSULT_MEM_ADVISE_PREF_LOC) {
2513 				dpagemap = xe_vma_resolve_pagemap(vma,
2514 								  xe_device_get_root_tile(vm->xe));
2515 			} else if (prefetch_region) {
2516 				tile = &vm->xe->tiles[region_to_mem_type[prefetch_region] -
2517 						      XE_PL_VRAM0];
2518 				dpagemap = xe_tile_local_pagemap(tile);
2519 			}
2520 
2521 			op->prefetch_range.dpagemap = dpagemap;
2522 alloc_next_range:
2523 			svm_range = xe_svm_range_find_or_insert(vm, addr, vma, &ctx);
2524 
2525 			if (PTR_ERR(svm_range) == -ENOENT) {
2526 				u64 ret = xe_svm_find_vma_start(vm, addr, range_end, vma);
2527 
2528 				addr = ret == ULONG_MAX ? 0 : ret;
2529 				if (addr)
2530 					goto alloc_next_range;
2531 				else
2532 					goto print_op_label;
2533 			}
2534 
2535 			if (IS_ERR(svm_range)) {
2536 				err = PTR_ERR(svm_range);
2537 				goto unwind_prefetch_ops;
2538 			}
2539 
2540 			if (xe_svm_range_validate(vm, svm_range, tile_mask, dpagemap)) {
2541 				xe_svm_range_debug(svm_range, "PREFETCH - RANGE IS VALID");
2542 				goto check_next_range;
2543 			}
2544 
2545 			err = xa_alloc(&op->prefetch_range.range,
2546 				       &i, svm_range, xa_limit_32b,
2547 				       GFP_KERNEL);
2548 
2549 			if (err)
2550 				goto unwind_prefetch_ops;
2551 
2552 			op->prefetch_range.ranges_count++;
2553 			vops->flags |= XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH;
2554 			xe_svm_range_debug(svm_range, "PREFETCH - RANGE CREATED");
2555 check_next_range:
2556 			if (range_end > xe_svm_range_end(svm_range) &&
2557 			    xe_svm_range_end(svm_range) < xe_vma_end(vma)) {
2558 				addr = xe_svm_range_end(svm_range);
2559 				goto alloc_next_range;
2560 			}
2561 		}
2562 print_op_label:
2563 		print_op(vm->xe, __op);
2564 	}
2565 
2566 	return ops;
2567 
2568 unwind_prefetch_ops:
2569 	xe_svm_prefetch_gpuva_ops_fini(ops);
2570 	drm_gpuva_ops_free(&vm->gpuvm, ops);
2571 	return ERR_PTR(err);
2572 }
2573 
2574 ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_create, ERRNO);
2575 
/*
 * Create a VMA for the GPUVA map operation @op with memory attributes @attr
 * and creation flags @flags.
 *
 * With a backing BO, the VM object (only if the BO is not private to a VM)
 * and the BO are locked inside an xe_validation_guard() transaction before
 * the VMA is created, and preempt fences are added for non-VM-private BOs.
 * Without a BO, the VMA is created directly and, if it turns out to be a
 * userptr VMA, its pages are pinned.
 *
 * Must be called with vm->lock held for write. Returns the new VMA or an
 * ERR_PTR() on failure.
 */
static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
			      struct xe_vma_mem_attr *attr, unsigned int flags)
{
	struct xe_bo *bo = op->gem.obj ? gem_to_xe_bo(op->gem.obj) : NULL;
	struct xe_validation_ctx ctx;
	struct drm_exec exec;
	struct xe_vma *vma;
	int err = 0;

	lockdep_assert_held_write(&vm->lock);

	if (bo) {
		err = 0;
		/*
		 * NOTE(review): xe_validation_guard() is defined elsewhere;
		 * presumably it is a scoped guard that tolerates the early
		 * returns below - confirm against its definition.
		 */
		xe_validation_guard(&ctx, &vm->xe->val, &exec,
				    (struct xe_val_flags) {.interruptible = true}, err) {
			/* BO not private to a VM: lock the VM object as well. */
			if (!bo->vm) {
				err = drm_exec_lock_obj(&exec, xe_vm_obj(vm));
				drm_exec_retry_on_contention(&exec);
			}
			if (!err) {
				err = drm_exec_lock_obj(&exec, &bo->ttm.base);
				drm_exec_retry_on_contention(&exec);
			}
			if (err)
				return ERR_PTR(err);

			vma = xe_vma_create(vm, bo, op->gem.offset,
					    op->va.addr, op->va.addr +
					    op->va.range - 1, attr, flags);
			if (IS_ERR(vma))
				return vma;

			if (!bo->vm) {
				err = add_preempt_fences(vm, bo);
				/* Destroyed here, while the locks are still held. */
				if (err) {
					prep_vma_destroy(vm, vma, false);
					xe_vma_destroy(vma, NULL);
				}
			}
		}
		/* The VMA was already destroyed above; only report the error. */
		if (err)
			return ERR_PTR(err);
	} else {
		vma = xe_vma_create(vm, NULL, op->gem.offset,
				    op->va.addr, op->va.addr +
				    op->va.range - 1, attr, flags);
		if (IS_ERR(vma))
			return vma;

		if (xe_vma_is_userptr(vma)) {
			err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
			/*
			 * -EBUSY has dedicated meaning that a user fence
			 * attached to the VMA is busy, in practice
			 * xe_vma_userptr_pin_pages can only fail with -EBUSY if
			 * we are low on memory so convert this to -ENOMEM.
			 */
			if (err == -EBUSY)
				err = -ENOMEM;
		}
	}
	/* Only reachable with an error from the userptr pinning path. */
	if (err) {
		prep_vma_destroy(vm, vma, false);
		xe_vma_destroy_unlocked(vma);
		vma = ERR_PTR(err);
	}

	return vma;
}
2645 
2646 static u64 xe_vma_max_pte_size(struct xe_vma *vma)
2647 {
2648 	if (vma->gpuva.flags & XE_VMA_PTE_1G)
2649 		return SZ_1G;
2650 	else if (vma->gpuva.flags & (XE_VMA_PTE_2M | XE_VMA_PTE_COMPACT))
2651 		return SZ_2M;
2652 	else if (vma->gpuva.flags & XE_VMA_PTE_64K)
2653 		return SZ_64K;
2654 	else if (vma->gpuva.flags & XE_VMA_PTE_4K)
2655 		return SZ_4K;
2656 
2657 	return SZ_1G;	/* Uninitialized, used max size */
2658 }
2659 
2660 static void xe_vma_set_pte_size(struct xe_vma *vma, u64 size)
2661 {
2662 	switch (size) {
2663 	case SZ_1G:
2664 		vma->gpuva.flags |= XE_VMA_PTE_1G;
2665 		break;
2666 	case SZ_2M:
2667 		vma->gpuva.flags |= XE_VMA_PTE_2M;
2668 		break;
2669 	case SZ_64K:
2670 		vma->gpuva.flags |= XE_VMA_PTE_64K;
2671 		break;
2672 	case SZ_4K:
2673 		vma->gpuva.flags |= XE_VMA_PTE_4K;
2674 		break;
2675 	}
2676 }
2677 
2678 static int xe_vma_op_commit(struct xe_vm *vm, struct xe_vma_op *op)
2679 {
2680 	int err = 0;
2681 
2682 	lockdep_assert_held_write(&vm->lock);
2683 
2684 	switch (op->base.op) {
2685 	case DRM_GPUVA_OP_MAP:
2686 		err |= xe_vm_insert_vma(vm, op->map.vma);
2687 		if (!err)
2688 			op->flags |= XE_VMA_OP_COMMITTED;
2689 		break;
2690 	case DRM_GPUVA_OP_REMAP:
2691 	{
2692 		u8 tile_present =
2693 			gpuva_to_vma(op->base.remap.unmap->va)->tile_present;
2694 
2695 		prep_vma_destroy(vm, gpuva_to_vma(op->base.remap.unmap->va),
2696 				 true);
2697 		op->flags |= XE_VMA_OP_COMMITTED;
2698 
2699 		if (op->remap.prev) {
2700 			err |= xe_vm_insert_vma(vm, op->remap.prev);
2701 			if (!err)
2702 				op->flags |= XE_VMA_OP_PREV_COMMITTED;
2703 			if (!err && op->remap.skip_prev) {
2704 				op->remap.prev->tile_present =
2705 					tile_present;
2706 			}
2707 		}
2708 		if (op->remap.next) {
2709 			err |= xe_vm_insert_vma(vm, op->remap.next);
2710 			if (!err)
2711 				op->flags |= XE_VMA_OP_NEXT_COMMITTED;
2712 			if (!err && op->remap.skip_next) {
2713 				op->remap.next->tile_present =
2714 					tile_present;
2715 			}
2716 		}
2717 
2718 		/*
2719 		 * Adjust for partial unbind after removing VMA from VM. In case
2720 		 * of unwind we might need to undo this later.
2721 		 */
2722 		if (!err) {
2723 			op->base.remap.unmap->va->va.addr = op->remap.start;
2724 			op->base.remap.unmap->va->va.range = op->remap.range;
2725 		}
2726 		break;
2727 	}
2728 	case DRM_GPUVA_OP_UNMAP:
2729 		prep_vma_destroy(vm, gpuva_to_vma(op->base.unmap.va), true);
2730 		op->flags |= XE_VMA_OP_COMMITTED;
2731 		break;
2732 	case DRM_GPUVA_OP_PREFETCH:
2733 		op->flags |= XE_VMA_OP_COMMITTED;
2734 		break;
2735 	default:
2736 		drm_warn(&vm->xe->drm, "NOT POSSIBLE\n");
2737 	}
2738 
2739 	return err;
2740 }
2741 
2742 /**
2743  * xe_vma_has_default_mem_attrs - Check if a VMA has default memory attributes
2744  * @vma: Pointer to the xe_vma structure to check
2745  *
2746  * This function determines whether the given VMA (Virtual Memory Area)
2747  * has its memory attributes set to their default values. Specifically,
2748  * it checks the following conditions:
2749  *
2750  * - `atomic_access` is `DRM_XE_VMA_ATOMIC_UNDEFINED`
2751  * - `pat_index` is equal to `default_pat_index`
2752  * - `preferred_loc.devmem_fd` is `DRM_XE_PREFERRED_LOC_DEFAULT_DEVICE`
2753  * - `preferred_loc.migration_policy` is `DRM_XE_MIGRATE_ALL_PAGES`
2754  *
2755  * Return: true if all attributes are at their default values, false otherwise.
2756  */
2757 bool xe_vma_has_default_mem_attrs(struct xe_vma *vma)
2758 {
2759 	return (vma->attr.atomic_access == DRM_XE_ATOMIC_UNDEFINED &&
2760 		vma->attr.pat_index ==  vma->attr.default_pat_index &&
2761 		vma->attr.preferred_loc.devmem_fd == DRM_XE_PREFERRED_LOC_DEFAULT_DEVICE &&
2762 		vma->attr.preferred_loc.migration_policy == DRM_XE_MIGRATE_ALL_PAGES);
2763 }
2764 
/*
 * Parse a list of GPUVA operations: append each one to @vops->list, create
 * the VMAs needed by MAP/REMAP operations, account the page-table update
 * operations each op will need, and commit the op with xe_vma_op_commit().
 *
 * Must be called with vm->lock held for write. Returns 0 on success or a
 * negative error code; on error the function returns immediately, leaving
 * the ops processed so far on @vops->list (presumably for the caller to
 * unwind - confirm against the callers).
 */
static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct drm_gpuva_ops *ops,
				   struct xe_vma_ops *vops)
{
	struct xe_device *xe = vm->xe;
	struct drm_gpuva_op *__op;
	struct xe_tile *tile;
	u8 id, tile_mask = 0;
	int err = 0;

	lockdep_assert_held_write(&vm->lock);

	/* Every op targets all tiles of the device. */
	for_each_tile(tile, vm->xe, id)
		tile_mask |= 0x1 << id;

	drm_gpuva_for_each_op(__op, ops) {
		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
		struct xe_vma *vma;
		unsigned int flags = 0;

		INIT_LIST_HEAD(&op->link);
		list_add_tail(&op->link, &vops->list);
		op->tile_mask = tile_mask;

		switch (op->base.op) {
		case DRM_GPUVA_OP_MAP:
		{
			/* Attributes a freshly mapped VMA starts out with. */
			struct xe_vma_mem_attr default_attr = {
				.preferred_loc = {
					.devmem_fd = DRM_XE_PREFERRED_LOC_DEFAULT_DEVICE,
					.migration_policy = DRM_XE_MIGRATE_ALL_PAGES,
				},
				.atomic_access = DRM_XE_ATOMIC_UNDEFINED,
				.default_pat_index = op->map.pat_index,
				.pat_index = op->map.pat_index,
				.purgeable_state = XE_MADV_PURGEABLE_WILLNEED,
			};

			flags |= op->map.vma_flags & XE_VMA_CREATE_MASK;

			vma = new_vma(vm, &op->base.map, &default_attr,
				      flags);
			if (IS_ERR(vma))
				return PTR_ERR(vma);

			op->map.vma = vma;
			/*
			 * A page-table update is accounted when the bind is
			 * performed right away (immediate, or VM not in fault
			 * mode) on a non-system-allocator VMA, or when the
			 * bind has to invalidate on bind.
			 */
			if (((op->map.immediate || !xe_vm_in_fault_mode(vm)) &&
			     !(op->map.vma_flags & XE_VMA_SYSTEM_ALLOCATOR)) ||
			    op->map.invalidate_on_bind)
				xe_vma_ops_incr_pt_update_ops(vops,
							      op->tile_mask, 1);
			break;
		}
		case DRM_GPUVA_OP_REMAP:
		{
			struct xe_vma *old =
				gpuva_to_vma(op->base.remap.unmap->va);
			bool skip = xe_vma_is_cpu_addr_mirror(old);
			u64 start = xe_vma_start(old), end = xe_vma_end(old);
			int num_remap_ops = 0;

			/* [start, end) is the part of @old that really goes away. */
			if (op->base.remap.prev)
				start = op->base.remap.prev->va.addr +
					op->base.remap.prev->va.range;
			if (op->base.remap.next)
				end = op->base.remap.next->va.addr;

			/*
			 * Existing SVM mappings in the removed part are only
			 * torn down for madvise-initiated ops; otherwise the
			 * remap is refused.
			 */
			if (xe_vma_is_cpu_addr_mirror(old) &&
			    xe_svm_has_mapping(vm, start, end)) {
				if (vops->flags & XE_VMA_OPS_FLAG_MADVISE)
					xe_svm_unmap_address_range(vm, start, end);
				else
					return -EBUSY;
			}

			/* old_start/old_range keep the original extent for unwind. */
			op->remap.start = xe_vma_start(old);
			op->remap.range = xe_vma_size(old);
			op->remap.old_start = op->remap.start;
			op->remap.old_range = op->remap.range;

			flags |= op->base.remap.unmap->va->flags & XE_VMA_CREATE_MASK;
			if (op->base.remap.prev) {
				vma = new_vma(vm, op->base.remap.prev,
					      &old->attr, flags);
				if (IS_ERR(vma))
					return PTR_ERR(vma);

				op->remap.prev = vma;

				/*
				 * Userptr creates a new SG mapping so
				 * we must also rebind.
				 */
				op->remap.skip_prev = skip ||
					(!xe_vma_is_userptr(old) &&
					IS_ALIGNED(xe_vma_end(vma),
						   xe_vma_max_pte_size(old)));
				if (op->remap.skip_prev) {
					/* Shrink the unmapped range past the kept prev part. */
					xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
					op->remap.range -=
						xe_vma_end(vma) -
						xe_vma_start(old);
					op->remap.start = xe_vma_end(vma);
					vm_dbg(&xe->drm, "REMAP:SKIP_PREV: addr=0x%016llx, range=0x%016llx",
					       (ULL)op->remap.start,
					       (ULL)op->remap.range);
				} else {
					num_remap_ops++;
				}
			}

			if (op->base.remap.next) {
				vma = new_vma(vm, op->base.remap.next,
					      &old->attr, flags);
				if (IS_ERR(vma))
					return PTR_ERR(vma);

				op->remap.next = vma;

				/*
				 * Userptr creates a new SG mapping so
				 * we must also rebind.
				 */
				op->remap.skip_next = skip ||
					(!xe_vma_is_userptr(old) &&
					IS_ALIGNED(xe_vma_start(vma),
						   xe_vma_max_pte_size(old)));
				if (op->remap.skip_next) {
					/* Shrink the unmapped range before the kept next part. */
					xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
					op->remap.range -=
						xe_vma_end(old) -
						xe_vma_start(vma);
					vm_dbg(&xe->drm, "REMAP:SKIP_NEXT: addr=0x%016llx, range=0x%016llx",
					       (ULL)op->remap.start,
					       (ULL)op->remap.range);
				} else {
					num_remap_ops++;
				}
			}
			/* One more update for the unmap itself, unless mirrored. */
			if (!skip)
				num_remap_ops++;

			xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask, num_remap_ops);
			break;
		}
		case DRM_GPUVA_OP_UNMAP:
			vma = gpuva_to_vma(op->base.unmap.va);

			/* Mirror VMAs with live SVM mappings need explicit permission. */
			if (xe_vma_is_cpu_addr_mirror(vma) &&
			    xe_svm_has_mapping(vm, xe_vma_start(vma),
					       xe_vma_end(vma)) &&
			    !(vops->flags & XE_VMA_OPS_FLAG_ALLOW_SVM_UNMAP))
				return -EBUSY;

			if (!xe_vma_is_cpu_addr_mirror(vma))
				xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask, 1);
			break;
		case DRM_GPUVA_OP_PREFETCH:
			vma = gpuva_to_vma(op->base.prefetch.va);

			if (xe_vma_is_userptr(vma)) {
				err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
				if (err)
					return err;
			}

			/* Mirror VMAs need one update per collected SVM range. */
			if (xe_vma_is_cpu_addr_mirror(vma))
				xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask,
							      op->prefetch_range.ranges_count);
			else
				xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask, 1);

			break;
		default:
			drm_warn(&vm->xe->drm, "NOT POSSIBLE\n");
		}

		err = xe_vma_op_commit(vm, op);
		if (err)
			return err;
	}

	return 0;
}
2948 
/*
 * Undo the effects of parsing/committing a single operation.
 *
 * @post_commit, @prev_post_commit and @next_post_commit tell whether the op
 * itself, its REMAP "prev" VMA and its REMAP "next" VMA had been committed;
 * only committed parts are removed from / re-inserted into the VM. VMAs
 * created for the op are destroyed, and VMAs that were flagged for
 * destruction get the XE_VMA_DESTROYED flag cleared again.
 *
 * Must be called with vm->lock held for write.
 */
static void xe_vma_op_unwind(struct xe_vm *vm, struct xe_vma_op *op,
			     bool post_commit, bool prev_post_commit,
			     bool next_post_commit)
{
	lockdep_assert_held_write(&vm->lock);

	switch (op->base.op) {
	case DRM_GPUVA_OP_MAP:
		/* The VMA may be missing if the op failed before it was created. */
		if (op->map.vma) {
			prep_vma_destroy(vm, op->map.vma, post_commit);
			xe_vma_destroy_unlocked(op->map.vma);
		}
		break;
	case DRM_GPUVA_OP_UNMAP:
	{
		struct xe_vma *vma = gpuva_to_vma(op->base.unmap.va);

		if (vma) {
			xe_svm_notifier_lock(vm);
			vma->gpuva.flags &= ~XE_VMA_DESTROYED;
			xe_svm_notifier_unlock(vm);
			/* Put the VMA back if the commit had removed it. */
			if (post_commit)
				xe_vm_insert_vma(vm, vma);
		}
		break;
	}
	case DRM_GPUVA_OP_REMAP:
	{
		struct xe_vma *vma = gpuva_to_vma(op->base.remap.unmap->va);

		/* Drop the replacement VMAs before restoring the original one. */
		if (op->remap.prev) {
			prep_vma_destroy(vm, op->remap.prev, prev_post_commit);
			xe_vma_destroy_unlocked(op->remap.prev);
		}
		if (op->remap.next) {
			prep_vma_destroy(vm, op->remap.next, next_post_commit);
			xe_vma_destroy_unlocked(op->remap.next);
		}
		if (vma) {
			xe_svm_notifier_lock(vm);
			vma->gpuva.flags &= ~XE_VMA_DESTROYED;
			xe_svm_notifier_unlock(vm);
			if (post_commit) {
				/*
				 * Restore the old va range, in case of the
				 * prev/next skip optimisation. Otherwise what
				 * we re-insert here could be smaller than the
				 * original range.
				 */
				op->base.remap.unmap->va->va.addr =
					op->remap.old_start;
				op->base.remap.unmap->va->va.range =
					op->remap.old_range;
				xe_vm_insert_vma(vm, vma);
			}
		}
		break;
	}
	case DRM_GPUVA_OP_PREFETCH:
		/* Nothing to do */
		break;
	default:
		drm_warn(&vm->xe->drm, "NOT POSSIBLE\n");
	}
}
3014 
3015 static void vm_bind_ioctl_ops_unwind(struct xe_vm *vm,
3016 				     struct drm_gpuva_ops **ops,
3017 				     int num_ops_list)
3018 {
3019 	int i;
3020 
3021 	for (i = num_ops_list - 1; i >= 0; --i) {
3022 		struct drm_gpuva_ops *__ops = ops[i];
3023 		struct drm_gpuva_op *__op;
3024 
3025 		if (!__ops)
3026 			continue;
3027 
3028 		drm_gpuva_for_each_op_reverse(__op, __ops) {
3029 			struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
3030 
3031 			xe_vma_op_unwind(vm, op,
3032 					 op->flags & XE_VMA_OP_COMMITTED,
3033 					 op->flags & XE_VMA_OP_PREV_COMMITTED,
3034 					 op->flags & XE_VMA_OP_NEXT_COMMITTED);
3035 		}
3036 	}
3037 }
3038 
/**
 * struct xe_vma_lock_and_validate_flags - Flags for vma_lock_and_validate()
 * @res_evict: Allow evicting resources during validation; only honoured
 *	when the VM itself allows eviction
 * @validate: Perform BO validation (skipped for VMAs whose purgeable state
 *	is not WILLNEED)
 * @request_decompress: Request BO decompression once locking/validation
 *	succeeded
 * @check_purged: Reject operation if BO is DONTNEED (-EBUSY) or PURGED
 *	(-EINVAL)
 */
struct xe_vma_lock_and_validate_flags {
	u32 res_evict : 1;
	u32 validate : 1;
	u32 request_decompress : 1;
	u32 check_purged : 1;
};
3052 
3053 static int vma_lock_and_validate(struct drm_exec *exec, struct xe_vma *vma,
3054 				 struct xe_vma_lock_and_validate_flags flags)
3055 {
3056 	struct xe_bo *bo = xe_vma_bo(vma);
3057 	struct xe_vm *vm = xe_vma_vm(vma);
3058 	bool validate_bo = flags.validate;
3059 	int err = 0;
3060 
3061 	if (bo) {
3062 		if (!bo->vm)
3063 			err = drm_exec_lock_obj(exec, &bo->ttm.base);
3064 
3065 		/* Reject new mappings to DONTNEED/purged BOs; allow cleanup operations */
3066 		if (!err && flags.check_purged) {
3067 			if (xe_bo_madv_is_dontneed(bo))
3068 				err = -EBUSY;  /* BO marked purgeable */
3069 			else if (xe_bo_is_purged(bo))
3070 				err = -EINVAL; /* BO already purged */
3071 		}
3072 
3073 		/* Don't validate the BO for DONTNEED/PURGED remap remnants. */
3074 		if (vma->attr.purgeable_state != XE_MADV_PURGEABLE_WILLNEED)
3075 			validate_bo = false;
3076 
3077 		if (!err && validate_bo)
3078 			err = xe_bo_validate(bo, vm,
3079 					     xe_vm_allow_vm_eviction(vm) &&
3080 					     flags.res_evict, exec);
3081 
3082 		if (err)
3083 			return err;
3084 
3085 		if (flags.request_decompress)
3086 			err = xe_bo_decompress(bo);
3087 	}
3088 
3089 	return err;
3090 }
3091 
3092 static int check_ufence(struct xe_vma *vma)
3093 {
3094 	if (vma->ufence) {
3095 		struct xe_user_fence * const f = vma->ufence;
3096 
3097 		if (!xe_sync_ufence_get_status(f))
3098 			return -EBUSY;
3099 
3100 		vma->ufence = NULL;
3101 		xe_sync_ufence_put(f);
3102 	}
3103 
3104 	return 0;
3105 }
3106 
3107 static int prefetch_ranges(struct xe_vm *vm, struct xe_vma_op *op)
3108 {
3109 	bool devmem_possible = IS_DGFX(vm->xe) && IS_ENABLED(CONFIG_DRM_XE_PAGEMAP);
3110 	struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
3111 	struct drm_pagemap *dpagemap = op->prefetch_range.dpagemap;
3112 	int err = 0;
3113 
3114 	struct xe_svm_range *svm_range;
3115 	struct drm_gpusvm_ctx ctx = {};
3116 	unsigned long i;
3117 
3118 	if (!xe_vma_is_cpu_addr_mirror(vma))
3119 		return 0;
3120 
3121 	ctx.read_only = xe_vma_read_only(vma);
3122 	ctx.devmem_possible = devmem_possible;
3123 	ctx.check_pages_threshold = devmem_possible ? SZ_64K : 0;
3124 	ctx.device_private_page_owner = xe_svm_private_page_owner(vm, !dpagemap);
3125 
3126 	/* TODO: Threading the migration */
3127 	xa_for_each(&op->prefetch_range.range, i, svm_range) {
3128 		if (!dpagemap)
3129 			xe_svm_range_migrate_to_smem(vm, svm_range);
3130 
3131 		if (IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)) {
3132 			drm_dbg(&vm->xe->drm,
3133 				"Prefetch pagemap is %s start 0x%016lx end 0x%016lx\n",
3134 				dpagemap ? dpagemap->drm->unique : "system",
3135 				xe_svm_range_start(svm_range), xe_svm_range_end(svm_range));
3136 		}
3137 
3138 		if (xe_svm_range_needs_migrate_to_vram(svm_range, vma, dpagemap)) {
3139 			err = xe_svm_alloc_vram(svm_range, &ctx, dpagemap);
3140 			if (err) {
3141 				drm_dbg(&vm->xe->drm, "VRAM allocation failed, retry from userspace, asid=%u, gpusvm=%p, errno=%pe\n",
3142 					vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
3143 				return -ENODATA;
3144 			}
3145 			xe_svm_range_debug(svm_range, "PREFETCH - RANGE MIGRATED TO VRAM");
3146 		}
3147 
3148 		err = xe_svm_range_get_pages(vm, svm_range, &ctx);
3149 		if (err) {
3150 			drm_dbg(&vm->xe->drm, "Get pages failed, asid=%u, gpusvm=%p, errno=%pe\n",
3151 				vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
3152 			if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM)
3153 				err = -ENODATA;
3154 			return err;
3155 		}
3156 		xe_svm_range_debug(svm_range, "PREFETCH - RANGE GET PAGES DONE");
3157 	}
3158 
3159 	return err;
3160 }
3161 
3162 static int op_lock_and_prep(struct drm_exec *exec, struct xe_vm *vm,
3163 			    struct xe_vma_ops *vops, struct xe_vma_op *op)
3164 {
3165 	int err = 0;
3166 	bool res_evict;
3167 
3168 	/*
3169 	 * We only allow evicting a BO within the VM if it is not part of an
3170 	 * array of binds, as an array of binds can evict another BO within the
3171 	 * bind.
3172 	 */
3173 	res_evict = !(vops->flags & XE_VMA_OPS_ARRAY_OF_BINDS);
3174 
3175 	switch (op->base.op) {
3176 	case DRM_GPUVA_OP_MAP:
3177 		if (!op->map.invalidate_on_bind)
3178 			err = vma_lock_and_validate(exec, op->map.vma,
3179 						    (struct xe_vma_lock_and_validate_flags) {
3180 							.res_evict = res_evict,
3181 							.validate = !xe_vm_in_fault_mode(vm) ||
3182 								    op->map.immediate,
3183 							.request_decompress =
3184 							op->map.request_decompress,
3185 							.check_purged = false,
3186 						    });
3187 		break;
3188 	case DRM_GPUVA_OP_REMAP:
3189 		err = check_ufence(gpuva_to_vma(op->base.remap.unmap->va));
3190 		if (err)
3191 			break;
3192 
3193 		err = vma_lock_and_validate(exec,
3194 					    gpuva_to_vma(op->base.remap.unmap->va),
3195 					    (struct xe_vma_lock_and_validate_flags) {
3196 						    .res_evict = res_evict,
3197 						    .validate = false,
3198 						    .request_decompress = false,
3199 						    .check_purged = false,
3200 					    });
3201 		if (!err && op->remap.prev)
3202 			err = vma_lock_and_validate(exec, op->remap.prev,
3203 						    (struct xe_vma_lock_and_validate_flags) {
3204 							    .res_evict = res_evict,
3205 							    .validate = true,
3206 							    .request_decompress = false,
3207 							    .check_purged = false,
3208 						    });
3209 		if (!err && op->remap.next)
3210 			err = vma_lock_and_validate(exec, op->remap.next,
3211 						    (struct xe_vma_lock_and_validate_flags) {
3212 							    .res_evict = res_evict,
3213 							    .validate = true,
3214 							    .request_decompress = false,
3215 							    .check_purged = false,
3216 						    });
3217 		break;
3218 	case DRM_GPUVA_OP_UNMAP:
3219 		err = check_ufence(gpuva_to_vma(op->base.unmap.va));
3220 		if (err)
3221 			break;
3222 
3223 		err = vma_lock_and_validate(exec,
3224 					    gpuva_to_vma(op->base.unmap.va),
3225 					    (struct xe_vma_lock_and_validate_flags) {
3226 						    .res_evict = res_evict,
3227 						    .validate = false,
3228 						    .request_decompress = false,
3229 						    .check_purged = false,
3230 					    });
3231 		break;
3232 	case DRM_GPUVA_OP_PREFETCH:
3233 	{
3234 		struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
3235 		u32 region;
3236 
3237 		if (!xe_vma_is_cpu_addr_mirror(vma)) {
3238 			region = op->prefetch.region;
3239 			xe_assert(vm->xe, region == DRM_XE_CONSULT_MEM_ADVISE_PREF_LOC ||
3240 				  region <= ARRAY_SIZE(region_to_mem_type));
3241 		}
3242 
3243 		/*
3244 		 * PREFETCH is the only op that still gates on BO purge state.
3245 		 * MAP/REMAP handle this inside xe_vma_create() so partial
3246 		 * unbind on a DONTNEED BO still works. PREFETCH skips
3247 		 * xe_vma_create() and would migrate a BO with no backing
3248 		 * store, so reject DONTNEED/PURGED here.
3249 		 */
3250 		err = vma_lock_and_validate(exec,
3251 					    gpuva_to_vma(op->base.prefetch.va),
3252 					    (struct xe_vma_lock_and_validate_flags) {
3253 						    .res_evict = res_evict,
3254 						    .validate = false,
3255 						    .request_decompress = false,
3256 						    .check_purged = true,
3257 					    });
3258 		if (!err && !xe_vma_has_no_bo(vma))
3259 			err = xe_bo_migrate(xe_vma_bo(vma),
3260 					    region_to_mem_type[region],
3261 					    NULL,
3262 					    exec);
3263 		break;
3264 	}
3265 	default:
3266 		drm_warn(&vm->xe->drm, "NOT POSSIBLE\n");
3267 	}
3268 
3269 	return err;
3270 }
3271 
3272 static int vm_bind_ioctl_ops_prefetch_ranges(struct xe_vm *vm, struct xe_vma_ops *vops)
3273 {
3274 	struct xe_vma_op *op;
3275 	int err;
3276 
3277 	if (!(vops->flags & XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH))
3278 		return 0;
3279 
3280 	list_for_each_entry(op, &vops->list, link) {
3281 		if (op->base.op  == DRM_GPUVA_OP_PREFETCH) {
3282 			err = prefetch_ranges(vm, op);
3283 			if (err)
3284 				return err;
3285 		}
3286 	}
3287 
3288 	return 0;
3289 }
3290 
3291 static int vm_bind_ioctl_ops_lock_and_prep(struct drm_exec *exec,
3292 					   struct xe_vm *vm,
3293 					   struct xe_vma_ops *vops)
3294 {
3295 	struct xe_vma_op *op;
3296 	int err;
3297 
3298 	err = drm_exec_lock_obj(exec, xe_vm_obj(vm));
3299 	if (err)
3300 		return err;
3301 
3302 	list_for_each_entry(op, &vops->list, link) {
3303 		err = op_lock_and_prep(exec, vm, vops, op);
3304 		if (err)
3305 			return err;
3306 	}
3307 
3308 #ifdef TEST_VM_OPS_ERROR
3309 	if (vops->inject_error &&
3310 	    vm->xe->vm_inject_error_position == FORCE_OP_ERROR_LOCK)
3311 		return -ENOSPC;
3312 #endif
3313 
3314 	return 0;
3315 }
3316 
3317 static void op_trace(struct xe_vma_op *op)
3318 {
3319 	switch (op->base.op) {
3320 	case DRM_GPUVA_OP_MAP:
3321 		trace_xe_vma_bind(op->map.vma);
3322 		break;
3323 	case DRM_GPUVA_OP_REMAP:
3324 		trace_xe_vma_unbind(gpuva_to_vma(op->base.remap.unmap->va));
3325 		if (op->remap.prev)
3326 			trace_xe_vma_bind(op->remap.prev);
3327 		if (op->remap.next)
3328 			trace_xe_vma_bind(op->remap.next);
3329 		break;
3330 	case DRM_GPUVA_OP_UNMAP:
3331 		trace_xe_vma_unbind(gpuva_to_vma(op->base.unmap.va));
3332 		break;
3333 	case DRM_GPUVA_OP_PREFETCH:
3334 		trace_xe_vma_bind(gpuva_to_vma(op->base.prefetch.va));
3335 		break;
3336 	case DRM_GPUVA_OP_DRIVER:
3337 		break;
3338 	default:
3339 		XE_WARN_ON("NOT POSSIBLE");
3340 	}
3341 }
3342 
3343 static void trace_xe_vm_ops_execute(struct xe_vma_ops *vops)
3344 {
3345 	struct xe_vma_op *op;
3346 
3347 	list_for_each_entry(op, &vops->list, link)
3348 		op_trace(op);
3349 }
3350 
3351 static int vm_ops_setup_tile_args(struct xe_vm *vm, struct xe_vma_ops *vops)
3352 {
3353 	struct xe_exec_queue *q = vops->q;
3354 	struct xe_tile *tile;
3355 	int number_tiles = 0;
3356 	u8 id;
3357 
3358 	for_each_tile(tile, vm->xe, id) {
3359 		if (vops->pt_update_ops[id].num_ops)
3360 			++number_tiles;
3361 
3362 		if (vops->pt_update_ops[id].q)
3363 			continue;
3364 
3365 		if (q) {
3366 			vops->pt_update_ops[id].q = q;
3367 			if (vm->pt_root[id] && !list_empty(&q->multi_gt_list))
3368 				q = list_next_entry(q, multi_gt_list);
3369 		} else {
3370 			vops->pt_update_ops[id].q = vm->q[id];
3371 		}
3372 	}
3373 
3374 	return number_tiles;
3375 }
3376 
/*
 * Prepare and run the page-table updates described by @vops on every tile
 * that has work, and bundle the resulting fences into one dma_fence_array.
 *
 * Return: the composite fence (&cf->base) on success, ERR_PTR(-ENODATA) when
 * no tile has any op, ERR_PTR(-ENOMEM) on allocation failure, or the error
 * reported by xe_pt_update_ops_prepare() / xe_pt_update_ops_run().
 */
static struct dma_fence *ops_execute(struct xe_vm *vm,
				     struct xe_vma_ops *vops)
{
	struct xe_tile *tile;
	struct dma_fence *fence = NULL;
	struct dma_fence **fences = NULL;
	struct dma_fence_array *cf = NULL;
	int number_tiles = 0, current_fence = 0, n_fence = 0, err, i;
	u8 id;

	number_tiles = vm_ops_setup_tile_args(vm, vops);
	if (number_tiles == 0)
		return ERR_PTR(-ENODATA);

	/*
	 * Size the fence array: one slot per tile, plus one slot per
	 * for_each_tlb_inval() iteration per tile unless the TLB wait is
	 * skipped. Tiles without ops are counted too; they get a stub fence
	 * below.
	 */
	for_each_tile(tile, vm->xe, id) {
		++n_fence;

		if (!(vops->flags & XE_VMA_OPS_FLAG_SKIP_TLB_WAIT))
			for_each_tlb_inval(i)
				++n_fence;
	}

	fences = kmalloc_objs(*fences, n_fence);
	if (!fences) {
		fence = ERR_PTR(-ENOMEM);
		goto err_trace;
	}

	cf = dma_fence_array_alloc(n_fence);
	if (!cf) {
		fence = ERR_PTR(-ENOMEM);
		goto err_out;
	}

	/* Prepare all tiles first so a failure happens before anything runs. */
	for_each_tile(tile, vm->xe, id) {
		if (!vops->pt_update_ops[id].num_ops)
			continue;

		err = xe_pt_update_ops_prepare(tile, vops);
		if (err) {
			fence = ERR_PTR(err);
			goto err_out;
		}
	}

	trace_xe_vm_ops_execute(vops);

	for_each_tile(tile, vm->xe, id) {
		struct xe_exec_queue *q = vops->pt_update_ops[tile->id].q;

		fence = NULL;
		if (!vops->pt_update_ops[id].num_ops)
			goto collect_fences;

		fence = xe_pt_update_ops_run(tile, vops);
		if (IS_ERR(fence))
			goto err_out;

collect_fences:
		/* A tile with no ops contributes a stub fence to keep slots aligned. */
		fences[current_fence++] = fence ?: dma_fence_get_stub();
		if (vops->flags & XE_VMA_OPS_FLAG_SKIP_TLB_WAIT)
			continue;

		/* Fetch the queue's last TLB invalidation fences under the migrate job lock. */
		xe_migrate_job_lock(tile->migrate, q);
		for_each_tlb_inval(i)
			fences[current_fence++] =
				xe_exec_queue_tlb_inval_last_fence_get(q, vm, i);
		xe_migrate_job_unlock(tile->migrate, q);
	}

	/* Every slot counted above must have been filled. */
	xe_assert(vm->xe, current_fence == n_fence);
	dma_fence_array_init(cf, n_fence, fences, dma_fence_context_alloc(1),
			     1);
	fence = &cf->base;

	for_each_tile(tile, vm->xe, id) {
		if (!vops->pt_update_ops[id].num_ops)
			continue;

		xe_pt_update_ops_fini(tile, vops);
	}

	return fence;

err_out:
	/* Abort on every tile with ops, then drop the fence references taken so far. */
	for_each_tile(tile, vm->xe, id) {
		if (!vops->pt_update_ops[id].num_ops)
			continue;

		xe_pt_update_ops_abort(tile, vops);
	}
	while (current_fence)
		dma_fence_put(fences[--current_fence]);
	kfree(fences);
	kfree(cf);

err_trace:
	trace_xe_vm_ops_fail(vm);
	return fence;
}
3477 
3478 static void vma_add_ufence(struct xe_vma *vma, struct xe_user_fence *ufence)
3479 {
3480 	if (vma->ufence)
3481 		xe_sync_ufence_put(vma->ufence);
3482 	vma->ufence = __xe_sync_ufence_get(ufence);
3483 }
3484 
3485 static void op_add_ufence(struct xe_vm *vm, struct xe_vma_op *op,
3486 			  struct xe_user_fence *ufence)
3487 {
3488 	switch (op->base.op) {
3489 	case DRM_GPUVA_OP_MAP:
3490 		if (!xe_vma_is_cpu_addr_mirror(op->map.vma))
3491 			vma_add_ufence(op->map.vma, ufence);
3492 		break;
3493 	case DRM_GPUVA_OP_REMAP:
3494 		if (op->remap.prev)
3495 			vma_add_ufence(op->remap.prev, ufence);
3496 		if (op->remap.next)
3497 			vma_add_ufence(op->remap.next, ufence);
3498 		break;
3499 	case DRM_GPUVA_OP_UNMAP:
3500 		break;
3501 	case DRM_GPUVA_OP_PREFETCH:
3502 		vma_add_ufence(gpuva_to_vma(op->base.prefetch.va), ufence);
3503 		break;
3504 	default:
3505 		drm_warn(&vm->xe->drm, "NOT POSSIBLE\n");
3506 	}
3507 }
3508 
3509 static void vm_bind_ioctl_ops_fini(struct xe_vm *vm, struct xe_vma_ops *vops,
3510 				   struct dma_fence *fence)
3511 {
3512 	struct xe_user_fence *ufence;
3513 	struct xe_vma_op *op;
3514 	int i;
3515 
3516 	ufence = find_ufence_get(vops->syncs, vops->num_syncs);
3517 	list_for_each_entry(op, &vops->list, link) {
3518 		if (ufence)
3519 			op_add_ufence(vm, op, ufence);
3520 
3521 		if (op->base.op == DRM_GPUVA_OP_UNMAP)
3522 			xe_vma_destroy(gpuva_to_vma(op->base.unmap.va), fence);
3523 		else if (op->base.op == DRM_GPUVA_OP_REMAP)
3524 			xe_vma_destroy(gpuva_to_vma(op->base.remap.unmap->va),
3525 				       fence);
3526 	}
3527 	if (ufence)
3528 		xe_sync_ufence_put(ufence);
3529 	if (fence) {
3530 		for (i = 0; i < vops->num_syncs; i++)
3531 			xe_sync_entry_signal(vops->syncs + i, fence);
3532 	}
3533 }
3534 
/*
 * Lock and prepare everything @vops touches, execute the ops and finalize
 * them. Must be called with vm->lock held for write.
 *
 * Return: the fence produced by ops_execute() on success, or an ERR_PTR.
 * When ops_execute() reports -ENODATA the ops are still finalized (with a
 * NULL fence) before the error pointer is returned.
 */
static struct dma_fence *vm_bind_ioctl_ops_execute(struct xe_vm *vm,
						   struct xe_vma_ops *vops)
{
	struct xe_validation_ctx ctx;
	struct drm_exec exec;
	struct dma_fence *fence;
	int err = 0;

	lockdep_assert_held_write(&vm->lock);

	xe_validation_guard(&ctx, &vm->xe->val, &exec,
			    ((struct xe_val_flags) {
				    .interruptible = true,
				    .exec_ignore_duplicates = true,
			    }), err) {
		/*
		 * The two retry helpers must directly follow the locking step:
		 * they act on its outcome (lock contention / OOM in @err).
		 */
		err = vm_bind_ioctl_ops_lock_and_prep(&exec, vm, vops);
		drm_exec_retry_on_contention(&exec);
		xe_validation_retry_on_oom(&ctx, &err);
		if (err)
			return ERR_PTR(err);

		/* Expose the exec transaction on the vm only while ops_execute() runs. */
		xe_vm_set_validation_exec(vm, &exec);
		fence = ops_execute(vm, vops);
		xe_vm_set_validation_exec(vm, NULL);
		if (IS_ERR(fence)) {
			/* -ENODATA: nothing was executed, but the ops still need finalizing. */
			if (PTR_ERR(fence) == -ENODATA)
				vm_bind_ioctl_ops_fini(vm, vops, NULL);
			return fence;
		}

		vm_bind_ioctl_ops_fini(vm, vops, fence);
	}

	return err ? ERR_PTR(err) : fence;
}
ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_execute, ERRNO);
3571 
/*
 * Bind-op flags accepted from userspace. vm_bind_ioctl_check_args() rejects
 * any op whose flags contain a bit outside SUPPORTED_FLAGS.
 */
#define SUPPORTED_FLAGS_STUB  \
	(DRM_XE_VM_BIND_FLAG_READONLY | \
	 DRM_XE_VM_BIND_FLAG_IMMEDIATE | \
	 DRM_XE_VM_BIND_FLAG_NULL | \
	 DRM_XE_VM_BIND_FLAG_DUMPABLE | \
	 DRM_XE_VM_BIND_FLAG_CHECK_PXP | \
	 DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR | \
	 DRM_XE_VM_BIND_FLAG_MADVISE_AUTORESET | \
	 DRM_XE_VM_BIND_FLAG_DECOMPRESS)

/* Builds with TEST_VM_OPS_ERROR additionally accept the FORCE_OP_ERROR flag. */
#ifdef TEST_VM_OPS_ERROR
#define SUPPORTED_FLAGS	(SUPPORTED_FLAGS_STUB | FORCE_OP_ERROR)
#else
#define SUPPORTED_FLAGS	SUPPORTED_FLAGS_STUB
#endif

/* Low 16 bits of an address/offset/size; any bit set means not 64K aligned. */
#define XE_64K_PAGE_MASK 0xffffull
/* Set of all defined DRM_XE_SYNCS_FLAG_* bits. */
#define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
3590 
3591 static int vm_bind_ioctl_check_args(struct xe_device *xe, struct xe_vm *vm,
3592 				    struct drm_xe_vm_bind *args,
3593 				    struct drm_xe_vm_bind_op **bind_ops)
3594 {
3595 	int err;
3596 	int i;
3597 
3598 	if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
3599 	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
3600 		return -EINVAL;
3601 
3602 	if (XE_IOCTL_DBG(xe, args->extensions))
3603 		return -EINVAL;
3604 
3605 	if (XE_IOCTL_DBG(xe, args->num_syncs > DRM_XE_MAX_SYNCS))
3606 		return -EINVAL;
3607 
3608 	if (args->num_binds > 1) {
3609 		u64 __user *bind_user =
3610 			u64_to_user_ptr(args->vector_of_binds);
3611 
3612 		*bind_ops = kvmalloc_objs(struct drm_xe_vm_bind_op,
3613 					  args->num_binds,
3614 					  GFP_KERNEL | __GFP_ACCOUNT | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
3615 		if (!*bind_ops)
3616 			return args->num_binds > 1 ? -ENOBUFS : -ENOMEM;
3617 
3618 		err = copy_from_user(*bind_ops, bind_user,
3619 				     sizeof(struct drm_xe_vm_bind_op) *
3620 				     args->num_binds);
3621 		if (XE_IOCTL_DBG(xe, err)) {
3622 			err = -EFAULT;
3623 			goto free_bind_ops;
3624 		}
3625 	} else {
3626 		*bind_ops = &args->bind;
3627 	}
3628 
3629 	for (i = 0; i < args->num_binds; ++i) {
3630 		u64 range = (*bind_ops)[i].range;
3631 		u64 addr = (*bind_ops)[i].addr;
3632 		u32 op = (*bind_ops)[i].op;
3633 		u32 flags = (*bind_ops)[i].flags;
3634 		u32 obj = (*bind_ops)[i].obj;
3635 		u64 obj_offset = (*bind_ops)[i].obj_offset;
3636 		u32 prefetch_region = (*bind_ops)[i].prefetch_mem_region_instance;
3637 		bool is_null = flags & DRM_XE_VM_BIND_FLAG_NULL;
3638 		bool is_cpu_addr_mirror = flags &
3639 			DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR;
3640 		bool is_decompress = flags & DRM_XE_VM_BIND_FLAG_DECOMPRESS;
3641 		u16 pat_index = (*bind_ops)[i].pat_index;
3642 		u16 coh_mode;
3643 		bool comp_en;
3644 
3645 		if (XE_IOCTL_DBG(xe, is_cpu_addr_mirror &&
3646 				 (!xe_vm_in_fault_mode(vm) ||
3647 				 !IS_ENABLED(CONFIG_DRM_XE_GPUSVM)))) {
3648 			err = -EINVAL;
3649 			goto free_bind_ops;
3650 		}
3651 
3652 		if (XE_IOCTL_DBG(xe, pat_index >= xe->pat.n_entries)) {
3653 			err = -EINVAL;
3654 			goto free_bind_ops;
3655 		}
3656 
3657 		pat_index = array_index_nospec(pat_index, xe->pat.n_entries);
3658 		(*bind_ops)[i].pat_index = pat_index;
3659 		coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
3660 		comp_en = xe_pat_index_get_comp_en(xe, pat_index);
3661 		if (XE_IOCTL_DBG(xe, !coh_mode)) { /* hw reserved */
3662 			err = -EINVAL;
3663 			goto free_bind_ops;
3664 		}
3665 
3666 		if (XE_WARN_ON(coh_mode > XE_COH_2WAY)) {
3667 			err = -EINVAL;
3668 			goto free_bind_ops;
3669 		}
3670 
3671 		if (XE_IOCTL_DBG(xe, op > DRM_XE_VM_BIND_OP_PREFETCH) ||
3672 		    XE_IOCTL_DBG(xe, flags & ~SUPPORTED_FLAGS) ||
3673 		    XE_IOCTL_DBG(xe, obj && (is_null || is_cpu_addr_mirror)) ||
3674 		    XE_IOCTL_DBG(xe, obj_offset && (is_null ||
3675 						    is_cpu_addr_mirror)) ||
3676 		    XE_IOCTL_DBG(xe, op != DRM_XE_VM_BIND_OP_MAP &&
3677 				 (is_decompress || is_null || is_cpu_addr_mirror)) ||
3678 		    XE_IOCTL_DBG(xe, is_decompress &&
3679 				 xe_pat_index_get_comp_en(xe, pat_index)) ||
3680 		    XE_IOCTL_DBG(xe, !obj &&
3681 				 op == DRM_XE_VM_BIND_OP_MAP &&
3682 				 !is_null && !is_cpu_addr_mirror) ||
3683 		    XE_IOCTL_DBG(xe, !obj &&
3684 				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
3685 		    XE_IOCTL_DBG(xe, addr &&
3686 				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
3687 		    XE_IOCTL_DBG(xe, range &&
3688 				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
3689 		    XE_IOCTL_DBG(xe, obj &&
3690 				 op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
3691 		    XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE &&
3692 				 op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
3693 		    XE_IOCTL_DBG(xe, !IS_DGFX(xe) && coh_mode == XE_COH_NONE &&
3694 				 is_cpu_addr_mirror) ||
3695 		    XE_IOCTL_DBG(xe, xe_device_is_l2_flush_optimized(xe) &&
3696 				 (op == DRM_XE_VM_BIND_OP_MAP_USERPTR ||
3697 				  is_cpu_addr_mirror) &&
3698 				 (pat_index != 19 && coh_mode != XE_COH_2WAY)) ||
3699 		    XE_IOCTL_DBG(xe, comp_en &&
3700 				 op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
3701 		    XE_IOCTL_DBG(xe, op == DRM_XE_VM_BIND_OP_MAP_USERPTR &&
3702 				 !IS_ENABLED(CONFIG_DRM_GPUSVM)) ||
3703 		    XE_IOCTL_DBG(xe, obj &&
3704 				 op == DRM_XE_VM_BIND_OP_PREFETCH) ||
3705 		    XE_IOCTL_DBG(xe, prefetch_region &&
3706 				 op != DRM_XE_VM_BIND_OP_PREFETCH) ||
3707 		    XE_IOCTL_DBG(xe, (prefetch_region != DRM_XE_CONSULT_MEM_ADVISE_PREF_LOC &&
3708 				      /* Guard against undefined shift in BIT(prefetch_region) */
3709 				      (prefetch_region >= (sizeof(xe->info.mem_region_mask) * 8) ||
3710 				      !(BIT(prefetch_region) & xe->info.mem_region_mask)))) ||
3711 		    XE_IOCTL_DBG(xe, obj &&
3712 				 op == DRM_XE_VM_BIND_OP_UNMAP) ||
3713 		    XE_IOCTL_DBG(xe, (flags & DRM_XE_VM_BIND_FLAG_MADVISE_AUTORESET) &&
3714 				 (!is_cpu_addr_mirror || op != DRM_XE_VM_BIND_OP_MAP))) {
3715 			err = -EINVAL;
3716 			goto free_bind_ops;
3717 		}
3718 
3719 		if (XE_IOCTL_DBG(xe, obj_offset & ~PAGE_MASK) ||
3720 		    XE_IOCTL_DBG(xe, addr & ~PAGE_MASK) ||
3721 		    XE_IOCTL_DBG(xe, range & ~PAGE_MASK) ||
3722 		    XE_IOCTL_DBG(xe, !range &&
3723 				 op != DRM_XE_VM_BIND_OP_UNMAP_ALL)) {
3724 			err = -EINVAL;
3725 			goto free_bind_ops;
3726 		}
3727 
3728 		if (is_decompress && (XE_IOCTL_DBG(xe, !xe_device_has_flat_ccs(xe)) ||
3729 				      XE_IOCTL_DBG(xe, GRAPHICS_VER(xe) < 20) ||
3730 				      XE_IOCTL_DBG(xe, !IS_DGFX(xe)))) {
3731 			err = -EOPNOTSUPP;
3732 			goto free_bind_ops;
3733 		}
3734 	}
3735 
3736 	return 0;
3737 
3738 free_bind_ops:
3739 	if (args->num_binds > 1)
3740 		kvfree(*bind_ops);
3741 	*bind_ops = NULL;
3742 	return err;
3743 }
3744 
3745 static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
3746 				       struct xe_exec_queue *q,
3747 				       struct xe_sync_entry *syncs,
3748 				       int num_syncs)
3749 {
3750 	struct dma_fence *fence = NULL;
3751 	int i, err = 0;
3752 
3753 	if (num_syncs) {
3754 		fence = xe_sync_in_fence_get(syncs, num_syncs,
3755 					     to_wait_exec_queue(vm, q), vm);
3756 		if (IS_ERR(fence))
3757 			return PTR_ERR(fence);
3758 
3759 		for (i = 0; i < num_syncs; i++)
3760 			xe_sync_entry_signal(&syncs[i], fence);
3761 	}
3762 
3763 	dma_fence_put(fence);
3764 
3765 	return err;
3766 }
3767 
3768 static void xe_vma_ops_init(struct xe_vma_ops *vops, struct xe_vm *vm,
3769 			    struct xe_exec_queue *q,
3770 			    struct xe_sync_entry *syncs, u32 num_syncs)
3771 {
3772 	memset(vops, 0, sizeof(*vops));
3773 	INIT_LIST_HEAD(&vops->list);
3774 	vops->vm = vm;
3775 	vops->q = q;
3776 	vops->syncs = syncs;
3777 	vops->num_syncs = num_syncs;
3778 	vops->flags = 0;
3779 }
3780 
3781 static int xe_vm_bind_ioctl_validate_bo(struct xe_device *xe, struct xe_bo *bo,
3782 					u64 addr, u64 range, u64 obj_offset,
3783 					u16 pat_index, u32 op, u32 bind_flags)
3784 {
3785 	u16 coh_mode;
3786 	bool comp_en;
3787 
3788 	if (XE_IOCTL_DBG(xe, (bo->flags & XE_BO_FLAG_NO_COMPRESSION) &&
3789 			 xe_pat_index_get_comp_en(xe, pat_index)))
3790 		return -EINVAL;
3791 
3792 	if (XE_IOCTL_DBG(xe, range > xe_bo_size(bo)) ||
3793 	    XE_IOCTL_DBG(xe, obj_offset >
3794 			 xe_bo_size(bo) - range)) {
3795 		return -EINVAL;
3796 	}
3797 
3798 	/*
3799 	 * Some platforms require 64k VM_BIND alignment,
3800 	 * specifically those with XE_VRAM_FLAGS_NEED64K.
3801 	 *
3802 	 * Other platforms may have BO's set to 64k physical placement,
3803 	 * but can be mapped at 4k offsets anyway. This check is only
3804 	 * there for the former case.
3805 	 */
3806 	if ((bo->flags & XE_BO_FLAG_INTERNAL_64K) &&
3807 	    (xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)) {
3808 		if (XE_IOCTL_DBG(xe, obj_offset &
3809 				 XE_64K_PAGE_MASK) ||
3810 		    XE_IOCTL_DBG(xe, addr & XE_64K_PAGE_MASK) ||
3811 		    XE_IOCTL_DBG(xe, range & XE_64K_PAGE_MASK)) {
3812 			return -EINVAL;
3813 		}
3814 	}
3815 
3816 	coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
3817 	if (bo->cpu_caching) {
3818 		if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE &&
3819 				 bo->cpu_caching == DRM_XE_GEM_CPU_CACHING_WB)) {
3820 			return -EINVAL;
3821 		}
3822 	} else if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE)) {
3823 		/*
3824 		 * Imported dma-buf from a different device should
3825 		 * require 1way or 2way coherency since we don't know
3826 		 * how it was mapped on the CPU. Just assume is it
3827 		 * potentially cached on CPU side.
3828 		 */
3829 		return -EINVAL;
3830 	}
3831 
3832 	/*
3833 	 * Ensures that imported buffer objects (dma-bufs) are not mapped
3834 	 * with a PAT index that enables compression.
3835 	 */
3836 	comp_en = xe_pat_index_get_comp_en(xe, pat_index);
3837 	if (XE_IOCTL_DBG(xe, bo->ttm.base.import_attach && comp_en))
3838 		return -EINVAL;
3839 
3840 	if (XE_IOCTL_DBG(xe, bo->ttm.base.import_attach && xe_device_is_l2_flush_optimized(xe) &&
3841 			 (pat_index != 19 && coh_mode != XE_COH_2WAY)))
3842 		return -EINVAL;
3843 
3844 	/* If a BO is protected it can only be mapped if the key is still valid */
3845 	if ((bind_flags & DRM_XE_VM_BIND_FLAG_CHECK_PXP) && xe_bo_is_protected(bo) &&
3846 	    op != DRM_XE_VM_BIND_OP_UNMAP && op != DRM_XE_VM_BIND_OP_UNMAP_ALL)
3847 		if (XE_IOCTL_DBG(xe, xe_pxp_bo_key_check(xe->pxp, bo) != 0))
3848 			return -ENOEXEC;
3849 
3850 	return 0;
3851 }
3852 
/*
 * VM_BIND ioctl entry point.
 *
 * Validates the arguments, looks up the VM, the optional exec queue and the
 * BOs referenced by the bind ops, parses the sync entries, builds the GPUVA
 * operation lists, and executes them under vm->lock held for write.
 *
 * When there is nothing to execute (-ENODATA: no binds, an empty op list, or
 * no page-table work) the syncs are still signalled through
 * vm_bind_ioctl_signal_fences() and that call's result is returned.
 *
 * Return: 0 on success, negative error code otherwise.
 */
int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
{
	struct xe_device *xe = to_xe_device(dev);
	struct xe_file *xef = to_xe_file(file);
	struct drm_xe_vm_bind *args = data;
	struct drm_xe_sync __user *syncs_user;
	struct xe_bo **bos = NULL;
	struct drm_gpuva_ops **ops = NULL;
	struct xe_vm *vm;
	struct xe_exec_queue *q = NULL;
	u32 num_syncs, num_ufence = 0;
	struct xe_sync_entry *syncs = NULL;
	struct drm_xe_vm_bind_op *bind_ops = NULL;
	struct xe_vma_ops vops;
	struct dma_fence *fence;
	int err;
	int i;

	vm = xe_vm_lookup(xef, args->vm_id);
	if (XE_IOCTL_DBG(xe, !vm))
		return -EINVAL;

	/* On failure check_args has already released bind_ops. */
	err = vm_bind_ioctl_check_args(xe, vm, args, &bind_ops);
	if (err)
		goto put_vm;

	if (args->exec_queue_id) {
		q = xe_exec_queue_lookup(xef, args->exec_queue_id);
		if (XE_IOCTL_DBG(xe, !q)) {
			err = -ENOENT;
			goto free_bind_ops;
		}

		/* Only queues carrying EXEC_QUEUE_FLAG_VM may be used for binds. */
		if (XE_IOCTL_DBG(xe, !(q->flags & EXEC_QUEUE_FLAG_VM))) {
			err = -EINVAL;
			goto put_exec_queue;
		}
	}

	/* A user-supplied queue must belong to the VM being bound. */
	if (XE_IOCTL_DBG(xe, q && vm != q->user_vm)) {
		err = -EINVAL;
		goto put_exec_queue;
	}

	/* Ensure all UNMAPs visible */
	xe_svm_flush(vm);

	err = down_write_killable(&vm->lock);
	if (err)
		goto put_exec_queue;

	if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) {
		err = -ENOENT;
		goto release_vm_lock;
	}

	/* Each [addr, addr + range) must fit in the VM; written to avoid overflow. */
	for (i = 0; i < args->num_binds; ++i) {
		u64 range = bind_ops[i].range;
		u64 addr = bind_ops[i].addr;

		if (XE_IOCTL_DBG(xe, range > vm->size) ||
		    XE_IOCTL_DBG(xe, addr > vm->size - range)) {
			err = -EINVAL;
			goto release_vm_lock;
		}
	}

	/* Zeroed per-bind arrays: unused slots stay NULL for the cleanup loops. */
	if (args->num_binds) {
		bos = kvzalloc_objs(*bos, args->num_binds,
				    GFP_KERNEL | __GFP_ACCOUNT | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
		if (!bos) {
			err = -ENOMEM;
			goto release_vm_lock;
		}

		ops = kvzalloc_objs(*ops, args->num_binds,
				    GFP_KERNEL | __GFP_ACCOUNT | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
		if (!ops) {
			err = -ENOMEM;
			goto free_bos;
		}
	}

	/* Resolve and validate the BO of every op that names one. */
	for (i = 0; i < args->num_binds; ++i) {
		struct drm_gem_object *gem_obj;
		u64 range = bind_ops[i].range;
		u64 addr = bind_ops[i].addr;
		u32 obj = bind_ops[i].obj;
		u64 obj_offset = bind_ops[i].obj_offset;
		u16 pat_index = bind_ops[i].pat_index;
		u32 op = bind_ops[i].op;
		u32 bind_flags = bind_ops[i].flags;

		if (!obj)
			continue;

		gem_obj = drm_gem_object_lookup(file, obj);
		if (XE_IOCTL_DBG(xe, !gem_obj)) {
			err = -ENOENT;
			goto put_obj;
		}
		bos[i] = gem_to_xe_bo(gem_obj);

		err = xe_vm_bind_ioctl_validate_bo(xe, bos[i], addr, range,
						   obj_offset, pat_index, op,
						   bind_flags);
		if (err)
			goto put_obj;
	}

	if (args->num_syncs) {
		syncs = kzalloc_objs(*syncs, args->num_syncs);
		if (!syncs) {
			err = -ENOMEM;
			goto put_obj;
		}
	}

	/*
	 * Parse the user sync entries. num_syncs counts the entries parsed so
	 * far, which is what the free_syncs cleanup relies on.
	 */
	syncs_user = u64_to_user_ptr(args->syncs);
	for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
		struct xe_exec_queue *__q = q ?: vm->q[0];

		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
					  &syncs_user[num_syncs],
					  __q->ufence_syncobj,
					  ++__q->ufence_timeline_value,
					  (xe_vm_in_lr_mode(vm) ?
					   SYNC_PARSE_FLAG_LR_MODE : 0) |
					  (!args->num_binds ?
					   SYNC_PARSE_FLAG_DISALLOW_USER_FENCE : 0));
		if (err)
			goto free_syncs;

		if (xe_sync_is_ufence(&syncs[num_syncs]))
			num_ufence++;
	}

	/* At most one user fence is allowed per ioctl. */
	if (XE_IOCTL_DBG(xe, num_ufence > 1)) {
		err = -EINVAL;
		goto free_syncs;
	}

	/* No binds: -ENODATA makes free_syncs just signal the syncs. */
	if (!args->num_binds) {
		err = -ENODATA;
		goto free_syncs;
	}

	xe_vma_ops_init(&vops, vm, q, syncs, num_syncs);
	if (args->num_binds > 1)
		vops.flags |= XE_VMA_OPS_ARRAY_OF_BINDS;
	/* Build and parse the GPUVA op list for each bind. */
	for (i = 0; i < args->num_binds; ++i) {
		u64 range = bind_ops[i].range;
		u64 addr = bind_ops[i].addr;
		u32 op = bind_ops[i].op;
		u32 flags = bind_ops[i].flags;
		u64 obj_offset = bind_ops[i].obj_offset;
		u32 prefetch_region = bind_ops[i].prefetch_mem_region_instance;
		u16 pat_index = bind_ops[i].pat_index;

		ops[i] = vm_bind_ioctl_ops_create(vm, &vops, bos[i], obj_offset,
						  addr, range, op, flags,
						  prefetch_region, pat_index);
		if (IS_ERR(ops[i])) {
			err = PTR_ERR(ops[i]);
			/* Keep the slot NULL so the unwind/free loops skip it. */
			ops[i] = NULL;
			goto unwind_ops;
		}

		err = vm_bind_ioctl_ops_parse(vm, ops[i], &vops);
		if (err)
			goto unwind_ops;

#ifdef TEST_VM_OPS_ERROR
		if (flags & FORCE_OP_ERROR) {
			vops.inject_error = true;
			vm->xe->vm_inject_error_position =
				(vm->xe->vm_inject_error_position + 1) %
				FORCE_OP_ERROR_COUNT;
		}
#endif
	}

	/* Nothing to do */
	if (list_empty(&vops.list)) {
		err = -ENODATA;
		goto unwind_ops;
	}

	err = xe_vma_ops_alloc(&vops, args->num_binds > 1);
	if (err)
		goto unwind_ops;

	err = vm_bind_ioctl_ops_prefetch_ranges(vm, &vops);
	if (err)
		goto unwind_ops;

	/* The returned fence is not needed here; drop the reference on success. */
	fence = vm_bind_ioctl_ops_execute(vm, &vops);
	if (IS_ERR(fence))
		err = PTR_ERR(fence);
	else
		dma_fence_put(fence);

unwind_ops:
	/* -ENODATA is not a failure of the ops themselves, so nothing to unwind. */
	if (err && err != -ENODATA)
		vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
	xe_vma_ops_fini(&vops);
	for (i = args->num_binds - 1; i >= 0; --i)
		if (ops[i])
			drm_gpuva_ops_free(&vm->gpuvm, ops[i]);
free_syncs:
	/* Nothing was executed: signal the syncs and report that result instead. */
	if (err == -ENODATA)
		err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs);
	while (num_syncs--)
		xe_sync_entry_cleanup(&syncs[num_syncs]);

	kfree(syncs);
put_obj:
	for (i = 0; i < args->num_binds; ++i)
		xe_bo_put(bos[i]);

	kvfree(ops);
free_bos:
	kvfree(bos);
release_vm_lock:
	up_write(&vm->lock);
put_exec_queue:
	if (q)
		xe_exec_queue_put(q);
free_bind_ops:
	/* bind_ops is heap-allocated only for an array of binds. */
	if (args->num_binds > 1)
		kvfree(bind_ops);
put_vm:
	xe_vm_put(vm);
	return err;
}
4088 
4089 /*
4090  * Map access type, fault type, and fault level from current bspec
4091  * specification to user spec abstraction.  The current mapping is
4092  * approximately 1-to-1, with access type being the only notable
4093  * exception as it carries additional data with respect to prefetch
4094  * status that needs to be masked out.
4095  */
4096 static u8 xe_to_user_access_type(u8 access_type)
4097 {
4098 	return access_type & XE_PAGEFAULT_ACCESS_TYPE_MASK;
4099 }
4100 
4101 static u8 xe_to_user_fault_type(u8 fault_type)
4102 {
4103 	return fault_type;
4104 }
4105 
4106 static u8 xe_to_user_fault_level(u8 fault_level)
4107 {
4108 	return fault_level;
4109 }
4110 
4111 static int fill_faults(struct xe_vm *vm,
4112 		       struct drm_xe_vm_get_property *args)
4113 {
4114 	struct xe_vm_fault __user *usr_ptr = u64_to_user_ptr(args->data);
4115 	struct xe_vm_fault *fault_list, fault_entry = { 0 };
4116 	struct xe_vm_fault_entry *entry;
4117 	int ret = 0, i = 0, count, entry_size;
4118 
4119 	entry_size = sizeof(struct xe_vm_fault);
4120 	count = args->size / entry_size;
4121 
4122 	fault_list = kcalloc(count, sizeof(struct xe_vm_fault), GFP_KERNEL);
4123 	if (!fault_list)
4124 		return -ENOMEM;
4125 
4126 	spin_lock(&vm->faults.lock);
4127 	list_for_each_entry(entry, &vm->faults.list, list) {
4128 		if (i == count)
4129 			break;
4130 
4131 		fault_entry.address = xe_device_canonicalize_addr(vm->xe, entry->address);
4132 		fault_entry.address_precision = entry->address_precision;
4133 
4134 		fault_entry.access_type = xe_to_user_access_type(entry->access_type);
4135 		fault_entry.fault_type = xe_to_user_fault_type(entry->fault_type);
4136 		fault_entry.fault_level = xe_to_user_fault_level(entry->fault_level);
4137 
4138 		memcpy(&fault_list[i], &fault_entry, entry_size);
4139 
4140 		i++;
4141 	}
4142 	spin_unlock(&vm->faults.lock);
4143 
4144 	ret = copy_to_user(usr_ptr, fault_list, args->size);
4145 
4146 	kfree(fault_list);
4147 	return ret ? -EFAULT : 0;
4148 }
4149 
4150 static int xe_vm_get_property_helper(struct xe_vm *vm,
4151 				     struct drm_xe_vm_get_property *args)
4152 {
4153 	size_t size;
4154 
4155 	switch (args->property) {
4156 	case DRM_XE_VM_GET_PROPERTY_FAULTS:
4157 		spin_lock(&vm->faults.lock);
4158 		size = size_mul(sizeof(struct xe_vm_fault), vm->faults.len);
4159 		spin_unlock(&vm->faults.lock);
4160 
4161 		if (!args->size) {
4162 			args->size = size;
4163 			return 0;
4164 		}
4165 
4166 		/*
4167 		 * Number of faults may increase between calls to
4168 		 * xe_vm_get_property_ioctl, so just report the number of
4169 		 * faults the user requests if it's less than or equal to
4170 		 * the number of faults in the VM fault array.
4171 		 *
4172 		 * We should also at least assert that the args->size value
4173 		 * is a multiple of the xe_vm_fault struct size.
4174 		 */
4175 		if (args->size > size || args->size % sizeof(struct xe_vm_fault))
4176 			return -EINVAL;
4177 
4178 		return fill_faults(vm, args);
4179 	}
4180 	return -EINVAL;
4181 }
4182 
4183 int xe_vm_get_property_ioctl(struct drm_device *drm, void *data,
4184 			     struct drm_file *file)
4185 {
4186 	struct xe_device *xe = to_xe_device(drm);
4187 	struct xe_file *xef = to_xe_file(file);
4188 	struct drm_xe_vm_get_property *args = data;
4189 	struct xe_vm *vm;
4190 	int ret = 0;
4191 
4192 	if (XE_IOCTL_DBG(xe, (args->reserved[0] || args->reserved[1] ||
4193 			      args->reserved[2] || args->extensions ||
4194 			      args->pad)))
4195 		return -EINVAL;
4196 
4197 	vm = xe_vm_lookup(xef, args->vm_id);
4198 	if (XE_IOCTL_DBG(xe, !vm))
4199 		return -ENOENT;
4200 
4201 	ret = xe_vm_get_property_helper(vm, args);
4202 
4203 	xe_vm_put(vm);
4204 	return ret;
4205 }
4206 
4207 /**
4208  * xe_vm_bind_kernel_bo - bind a kernel BO to a VM
4209  * @vm: VM to bind the BO to
4210  * @bo: BO to bind
4211  * @q: exec queue to use for the bind (optional)
4212  * @addr: address at which to bind the BO
4213  * @cache_lvl: PAT cache level to use
4214  *
4215  * Execute a VM bind map operation on a kernel-owned BO to bind it into a
4216  * kernel-owned VM.
4217  *
4218  * Returns a dma_fence to track the binding completion if the job to do so was
4219  * successfully submitted, an error pointer otherwise.
4220  */
4221 struct dma_fence *xe_vm_bind_kernel_bo(struct xe_vm *vm, struct xe_bo *bo,
4222 				       struct xe_exec_queue *q, u64 addr,
4223 				       enum xe_cache_level cache_lvl)
4224 {
4225 	struct xe_vma_ops vops;
4226 	struct drm_gpuva_ops *ops = NULL;
4227 	struct dma_fence *fence;
4228 	int err;
4229 
4230 	xe_bo_get(bo);
4231 	xe_vm_get(vm);
4232 	if (q)
4233 		xe_exec_queue_get(q);
4234 
4235 	down_write(&vm->lock);
4236 
4237 	xe_vma_ops_init(&vops, vm, q, NULL, 0);
4238 
4239 	ops = vm_bind_ioctl_ops_create(vm, &vops, bo, 0, addr, xe_bo_size(bo),
4240 				       DRM_XE_VM_BIND_OP_MAP, 0, 0,
4241 				       xe_cache_pat_idx(vm->xe, cache_lvl));
4242 	if (IS_ERR(ops)) {
4243 		err = PTR_ERR(ops);
4244 		goto release_vm_lock;
4245 	}
4246 
4247 	err = vm_bind_ioctl_ops_parse(vm, ops, &vops);
4248 	if (err)
4249 		goto release_vm_lock;
4250 
4251 	xe_assert(vm->xe, !list_empty(&vops.list));
4252 
4253 	err = xe_vma_ops_alloc(&vops, false);
4254 	if (err)
4255 		goto unwind_ops;
4256 
4257 	fence = vm_bind_ioctl_ops_execute(vm, &vops);
4258 	if (IS_ERR(fence))
4259 		err = PTR_ERR(fence);
4260 
4261 unwind_ops:
4262 	if (err && err != -ENODATA)
4263 		vm_bind_ioctl_ops_unwind(vm, &ops, 1);
4264 
4265 	xe_vma_ops_fini(&vops);
4266 	drm_gpuva_ops_free(&vm->gpuvm, ops);
4267 
4268 release_vm_lock:
4269 	up_write(&vm->lock);
4270 
4271 	if (q)
4272 		xe_exec_queue_put(q);
4273 	xe_vm_put(vm);
4274 	xe_bo_put(bo);
4275 
4276 	if (err)
4277 		fence = ERR_PTR(err);
4278 
4279 	return fence;
4280 }
4281 
4282 /**
4283  * xe_vm_lock() - Lock the vm's dma_resv object
4284  * @vm: The struct xe_vm whose lock is to be locked
4285  * @intr: Whether to perform any wait interruptible
4286  *
4287  * Return: 0 on success, -EINTR if @intr is true and the wait for a
4288  * contended lock was interrupted. If @intr is false, the function
4289  * always returns 0.
4290  */
4291 int xe_vm_lock(struct xe_vm *vm, bool intr)
4292 {
4293 	int ret;
4294 
4295 	if (intr)
4296 		ret = dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
4297 	else
4298 		ret = dma_resv_lock(xe_vm_resv(vm), NULL);
4299 
4300 	return ret;
4301 }
4302 
/**
 * xe_vm_unlock() - Unlock the vm's dma_resv object
 * @vm: The struct xe_vm whose dma_resv is to be released.
 *
 * Counterpart of xe_vm_lock(): releases the lock that call acquired.
 */
void xe_vm_unlock(struct xe_vm *vm)
{
	struct dma_resv *resv = xe_vm_resv(vm);

	dma_resv_unlock(resv);
}
4313 
4314 /**
4315  * xe_vm_invalidate_vma_submit - Submit a job to invalidate GPU mappings for
4316  * VMA.
4317  * @vma: VMA to invalidate
4318  * @batch: TLB invalidation batch to populate; caller must later call
4319  *         xe_tlb_inval_batch_wait() on it to wait for completion
4320  *
4321  * Walks a list of page tables leaves which it memset the entries owned by this
4322  * VMA to zero, invalidates the TLBs, but doesn't block waiting for TLB flush
4323  * to complete, but instead populates @batch which can be waited on using
4324  * xe_tlb_inval_batch_wait().
4325  *
4326  * Returns 0 for success, negative error code otherwise.
4327  */
4328 int xe_vm_invalidate_vma_submit(struct xe_vma *vma, struct xe_tlb_inval_batch *batch)
4329 {
4330 	struct xe_device *xe = xe_vma_vm(vma)->xe;
4331 	struct xe_vm *vm = xe_vma_vm(vma);
4332 	struct xe_tile *tile;
4333 	u8 tile_mask = 0;
4334 	int ret = 0;
4335 	u8 id;
4336 
4337 	xe_assert(xe, !xe_vma_is_null(vma));
4338 	xe_assert(xe, !xe_vma_is_cpu_addr_mirror(vma));
4339 	trace_xe_vma_invalidate(vma);
4340 
4341 	vm_dbg(&vm->xe->drm,
4342 	       "INVALIDATE: addr=0x%016llx, range=0x%016llx",
4343 		xe_vma_start(vma), xe_vma_size(vma));
4344 
4345 	/*
4346 	 * Check that we don't race with page-table updates, tile_invalidated
4347 	 * update is safe
4348 	 */
4349 	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
4350 		if (xe_vma_is_userptr(vma)) {
4351 			lockdep_assert(lockdep_is_held_type(&vm->svm.gpusvm.notifier_lock, 0) ||
4352 				       (lockdep_is_held_type(&vm->svm.gpusvm.notifier_lock, 1) &&
4353 					lockdep_is_held(&xe_vm_resv(vm)->lock.base)));
4354 
4355 			WARN_ON_ONCE(!mmu_interval_check_retry
4356 				     (&to_userptr_vma(vma)->userptr.notifier,
4357 				      to_userptr_vma(vma)->userptr.pages.notifier_seq));
4358 			WARN_ON_ONCE(!dma_resv_test_signaled(xe_vm_resv(vm),
4359 							     DMA_RESV_USAGE_BOOKKEEP));
4360 
4361 		} else {
4362 			xe_bo_assert_held(xe_vma_bo(vma));
4363 		}
4364 	}
4365 
4366 	for_each_tile(tile, xe, id)
4367 		if (xe_pt_zap_ptes(tile, vma))
4368 			tile_mask |= BIT(id);
4369 
4370 	xe_device_wmb(xe);
4371 
4372 	ret = xe_tlb_inval_range_tilemask_submit(xe, xe_vma_vm(vma)->usm.asid,
4373 						 xe_vma_start(vma), xe_vma_end(vma),
4374 						 tile_mask, batch);
4375 
4376 	/* WRITE_ONCE pairs with READ_ONCE in xe_vm_has_valid_gpu_mapping() */
4377 	WRITE_ONCE(vma->tile_invalidated, vma->tile_mask);
4378 	return ret;
4379 }
4380 
4381 /**
4382  * xe_vm_invalidate_vma - invalidate GPU mappings for VMA without a lock
4383  * @vma: VMA to invalidate
4384  *
4385  * Walks a list of page tables leaves which it memset the entries owned by this
4386  * VMA to zero, invalidates the TLBs, and block until TLBs invalidation is
4387  * complete.
4388  *
4389  * Returns 0 for success, negative error code otherwise.
4390  */
4391 int xe_vm_invalidate_vma(struct xe_vma *vma)
4392 {
4393 	struct xe_tlb_inval_batch batch;
4394 	int ret;
4395 
4396 	ret = xe_vm_invalidate_vma_submit(vma, &batch);
4397 	if (ret)
4398 		return ret;
4399 
4400 	xe_tlb_inval_batch_wait(&batch);
4401 	return ret;
4402 }
4403 
4404 int xe_vm_validate_protected(struct xe_vm *vm)
4405 {
4406 	struct drm_gpuva *gpuva;
4407 	int err = 0;
4408 
4409 	if (!vm)
4410 		return -ENODEV;
4411 
4412 	mutex_lock(&vm->snap_mutex);
4413 
4414 	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
4415 		struct xe_vma *vma = gpuva_to_vma(gpuva);
4416 		struct xe_bo *bo = vma->gpuva.gem.obj ?
4417 			gem_to_xe_bo(vma->gpuva.gem.obj) : NULL;
4418 
4419 		if (!bo)
4420 			continue;
4421 
4422 		if (xe_bo_is_protected(bo)) {
4423 			err = xe_pxp_bo_key_check(vm->xe->pxp, bo);
4424 			if (err)
4425 				break;
4426 		}
4427 	}
4428 
4429 	mutex_unlock(&vm->snap_mutex);
4430 	return err;
4431 }
4432 
4433 struct xe_vm_snapshot {
4434 	int uapi_flags;
4435 	unsigned long num_snaps;
4436 	struct {
4437 		u64 ofs, bo_ofs;
4438 		unsigned long len;
4439 #define XE_VM_SNAP_FLAG_USERPTR		BIT(0)
4440 #define XE_VM_SNAP_FLAG_READ_ONLY	BIT(1)
4441 #define XE_VM_SNAP_FLAG_IS_NULL		BIT(2)
4442 		unsigned long flags;
4443 		int uapi_mem_region;
4444 		u16 pat_index;
4445 		int cpu_caching;
4446 		struct xe_bo *bo;
4447 		void *data;
4448 		struct mm_struct *mm;
4449 	} snap[];
4450 };
4451 
4452 struct xe_vm_snapshot *xe_vm_snapshot_capture(struct xe_vm *vm)
4453 {
4454 	unsigned long num_snaps = 0, i;
4455 	struct xe_vm_snapshot *snap = NULL;
4456 	struct drm_gpuva *gpuva;
4457 
4458 	if (!vm)
4459 		return NULL;
4460 
4461 	mutex_lock(&vm->snap_mutex);
4462 	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
4463 		if (gpuva->flags & XE_VMA_DUMPABLE)
4464 			num_snaps++;
4465 	}
4466 
4467 	if (num_snaps)
4468 		snap = kvzalloc(offsetof(struct xe_vm_snapshot, snap[num_snaps]), GFP_NOWAIT);
4469 	if (!snap) {
4470 		snap = num_snaps ? ERR_PTR(-ENOMEM) : ERR_PTR(-ENODEV);
4471 		goto out_unlock;
4472 	}
4473 
4474 	if (vm->flags & XE_VM_FLAG_FAULT_MODE)
4475 		snap->uapi_flags |= DRM_XE_VM_CREATE_FLAG_FAULT_MODE;
4476 	if (vm->flags & XE_VM_FLAG_LR_MODE)
4477 		snap->uapi_flags |= DRM_XE_VM_CREATE_FLAG_LR_MODE;
4478 	if (vm->flags & XE_VM_FLAG_SCRATCH_PAGE)
4479 		snap->uapi_flags |= DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE;
4480 
4481 	snap->num_snaps = num_snaps;
4482 	i = 0;
4483 	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
4484 		struct xe_vma *vma = gpuva_to_vma(gpuva);
4485 		struct xe_bo *bo = vma->gpuva.gem.obj ?
4486 			gem_to_xe_bo(vma->gpuva.gem.obj) : NULL;
4487 
4488 		if (!(gpuva->flags & XE_VMA_DUMPABLE))
4489 			continue;
4490 
4491 		snap->snap[i].ofs = xe_vma_start(vma);
4492 		snap->snap[i].len = xe_vma_size(vma);
4493 		snap->snap[i].flags = xe_vma_read_only(vma) ?
4494 			XE_VM_SNAP_FLAG_READ_ONLY : 0;
4495 		snap->snap[i].pat_index = vma->attr.pat_index;
4496 		if (bo) {
4497 			snap->snap[i].cpu_caching = bo->cpu_caching;
4498 			snap->snap[i].bo = xe_bo_get(bo);
4499 			snap->snap[i].bo_ofs = xe_vma_bo_offset(vma);
4500 			switch (bo->ttm.resource->mem_type) {
4501 			case XE_PL_SYSTEM:
4502 			case XE_PL_TT:
4503 				snap->snap[i].uapi_mem_region = 0;
4504 				break;
4505 			case XE_PL_VRAM0:
4506 				snap->snap[i].uapi_mem_region = 1;
4507 				break;
4508 			case XE_PL_VRAM1:
4509 				snap->snap[i].uapi_mem_region = 2;
4510 				break;
4511 			}
4512 		} else if (xe_vma_is_userptr(vma)) {
4513 			struct mm_struct *mm =
4514 				to_userptr_vma(vma)->userptr.notifier.mm;
4515 
4516 			if (mmget_not_zero(mm))
4517 				snap->snap[i].mm = mm;
4518 			else
4519 				snap->snap[i].data = ERR_PTR(-EFAULT);
4520 
4521 			snap->snap[i].bo_ofs = xe_vma_userptr(vma);
4522 			snap->snap[i].flags |= XE_VM_SNAP_FLAG_USERPTR;
4523 			snap->snap[i].uapi_mem_region = 0;
4524 		} else if (xe_vma_is_null(vma)) {
4525 			snap->snap[i].flags |= XE_VM_SNAP_FLAG_IS_NULL;
4526 			snap->snap[i].uapi_mem_region = -1;
4527 		} else {
4528 			snap->snap[i].data = ERR_PTR(-ENOENT);
4529 			snap->snap[i].uapi_mem_region = -1;
4530 		}
4531 		i++;
4532 	}
4533 
4534 out_unlock:
4535 	mutex_unlock(&vm->snap_mutex);
4536 	return snap;
4537 }
4538 
4539 void xe_vm_snapshot_capture_delayed(struct xe_vm_snapshot *snap)
4540 {
4541 	if (IS_ERR_OR_NULL(snap))
4542 		return;
4543 
4544 	for (int i = 0; i < snap->num_snaps; i++) {
4545 		struct xe_bo *bo = snap->snap[i].bo;
4546 		int err;
4547 
4548 		if (IS_ERR(snap->snap[i].data) ||
4549 		    snap->snap[i].flags & XE_VM_SNAP_FLAG_IS_NULL)
4550 			continue;
4551 
4552 		snap->snap[i].data = kvmalloc(snap->snap[i].len, GFP_USER);
4553 		if (!snap->snap[i].data) {
4554 			snap->snap[i].data = ERR_PTR(-ENOMEM);
4555 			goto cleanup_bo;
4556 		}
4557 
4558 		if (bo) {
4559 			err = xe_bo_read(bo, snap->snap[i].bo_ofs,
4560 					 snap->snap[i].data, snap->snap[i].len);
4561 		} else {
4562 			void __user *userptr = (void __user *)(size_t)snap->snap[i].bo_ofs;
4563 
4564 			kthread_use_mm(snap->snap[i].mm);
4565 			if (!copy_from_user(snap->snap[i].data, userptr, snap->snap[i].len))
4566 				err = 0;
4567 			else
4568 				err = -EFAULT;
4569 			kthread_unuse_mm(snap->snap[i].mm);
4570 
4571 			mmput(snap->snap[i].mm);
4572 			snap->snap[i].mm = NULL;
4573 		}
4574 
4575 		if (err) {
4576 			kvfree(snap->snap[i].data);
4577 			snap->snap[i].data = ERR_PTR(err);
4578 		}
4579 
4580 cleanup_bo:
4581 		xe_bo_put(bo);
4582 		snap->snap[i].bo = NULL;
4583 	}
4584 }
4585 
4586 void xe_vm_snapshot_print(struct xe_vm_snapshot *snap, struct drm_printer *p)
4587 {
4588 	unsigned long i, j;
4589 
4590 	if (IS_ERR_OR_NULL(snap)) {
4591 		drm_printf(p, "[0].error: %li\n", PTR_ERR(snap));
4592 		return;
4593 	}
4594 
4595 	drm_printf(p, "VM.uapi_flags: 0x%x\n", snap->uapi_flags);
4596 	for (i = 0; i < snap->num_snaps; i++) {
4597 		drm_printf(p, "[%llx].length: 0x%lx\n", snap->snap[i].ofs, snap->snap[i].len);
4598 
4599 		drm_printf(p, "[%llx].properties: %s|%s|mem_region=0x%lx|pat_index=%d|cpu_caching=%d\n",
4600 			   snap->snap[i].ofs,
4601 			   snap->snap[i].flags & XE_VM_SNAP_FLAG_READ_ONLY ?
4602 			   "read_only" : "read_write",
4603 			   snap->snap[i].flags & XE_VM_SNAP_FLAG_IS_NULL ?
4604 			   "null_sparse" :
4605 			   snap->snap[i].flags & XE_VM_SNAP_FLAG_USERPTR ?
4606 			   "userptr" : "bo",
4607 			   snap->snap[i].uapi_mem_region == -1 ? 0 :
4608 			   BIT(snap->snap[i].uapi_mem_region),
4609 			   snap->snap[i].pat_index,
4610 			   snap->snap[i].cpu_caching);
4611 
4612 		if (IS_ERR(snap->snap[i].data)) {
4613 			drm_printf(p, "[%llx].error: %li\n", snap->snap[i].ofs,
4614 				   PTR_ERR(snap->snap[i].data));
4615 			continue;
4616 		}
4617 
4618 		if (snap->snap[i].flags & XE_VM_SNAP_FLAG_IS_NULL)
4619 			continue;
4620 
4621 		drm_printf(p, "[%llx].data: ", snap->snap[i].ofs);
4622 
4623 		for (j = 0; j < snap->snap[i].len; j += sizeof(u32)) {
4624 			u32 *val = snap->snap[i].data + j;
4625 			char dumped[ASCII85_BUFSZ];
4626 
4627 			drm_puts(p, ascii85_encode(*val, dumped));
4628 		}
4629 
4630 		drm_puts(p, "\n");
4631 
4632 		if (drm_coredump_printer_is_full(p))
4633 			return;
4634 	}
4635 }
4636 
4637 void xe_vm_snapshot_free(struct xe_vm_snapshot *snap)
4638 {
4639 	unsigned long i;
4640 
4641 	if (IS_ERR_OR_NULL(snap))
4642 		return;
4643 
4644 	for (i = 0; i < snap->num_snaps; i++) {
4645 		if (!IS_ERR(snap->snap[i].data))
4646 			kvfree(snap->snap[i].data);
4647 		xe_bo_put(snap->snap[i].bo);
4648 		if (snap->snap[i].mm)
4649 			mmput(snap->snap[i].mm);
4650 	}
4651 	kvfree(snap);
4652 }
4653 
4654 /**
4655  * xe_vma_need_vram_for_atomic - Check if VMA needs VRAM migration for atomic operations
4656  * @xe: Pointer to the Xe device structure
4657  * @vma: Pointer to the virtual memory area (VMA) structure
4658  * @is_atomic: In pagefault path and atomic operation
4659  *
4660  * This function determines whether the given VMA needs to be migrated to
4661  * VRAM in order to do atomic GPU operation.
4662  *
4663  * Return:
4664  *   1        - Migration to VRAM is required
4665  *   0        - Migration is not required
4666  *   -EACCES  - Invalid access for atomic memory attr
4667  *
4668  */
4669 int xe_vma_need_vram_for_atomic(struct xe_device *xe, struct xe_vma *vma, bool is_atomic)
4670 {
4671 	u32 atomic_access = xe_vma_bo(vma) ? xe_vma_bo(vma)->attr.atomic_access :
4672 					     vma->attr.atomic_access;
4673 
4674 	if (!IS_DGFX(xe) || !is_atomic)
4675 		return false;
4676 
4677 	/*
4678 	 * NOTE: The checks implemented here are platform-specific. For
4679 	 * instance, on a device supporting CXL atomics, these would ideally
4680 	 * work universally without additional handling.
4681 	 */
4682 	switch (atomic_access) {
4683 	case DRM_XE_ATOMIC_DEVICE:
4684 		return !xe->info.has_device_atomics_on_smem;
4685 
4686 	case DRM_XE_ATOMIC_CPU:
4687 		return -EACCES;
4688 
4689 	case DRM_XE_ATOMIC_UNDEFINED:
4690 	case DRM_XE_ATOMIC_GLOBAL:
4691 	default:
4692 		return 1;
4693 	}
4694 }
4695 
/*
 * xe_vm_alloc_vma() - Create and commit the VMA ops for a map request
 * @vm: The vm to operate on; vm->lock must be held for writing.
 * @map_req: The gpuvm map request describing the address range.
 * @is_madvise: true to build the ops with drm_gpuvm_madvise_ops_create(),
 *	false to build them with drm_gpuvm_sm_map_ops_create().
 *
 * Builds the gpuvm op list for @map_req, propagates VMA flags (and, for the
 * non-madvise case, the default PAT index) from the VMAs being unmapped or
 * remapped to the MAP ops, parses the ops, and finally, under the vm's
 * dma_resv lock, destroys the replaced VMAs. For madvise the memory
 * attributes of the remapped VMA are copied to the newly created VMA.
 *
 * Return: 0 on success (including when no ops were generated), otherwise the
 * error from ops creation or from vm_bind_ioctl_ops_parse().
 */
static int xe_vm_alloc_vma(struct xe_vm *vm,
			   struct drm_gpuvm_map_req *map_req,
			   bool is_madvise)
{
	struct xe_vma_ops vops;
	struct drm_gpuva_ops *ops = NULL;
	struct drm_gpuva_op *__op;
	unsigned int vma_flags = 0;
	bool remap_op = false;
	struct xe_vma_mem_attr tmp_attr = {};
	/*
	 * NOTE(review): default_pat is only assigned by an UNMAP/REMAP op; a
	 * MAP op seen before any of those would read it uninitialised.
	 * Confirm that the op ordering guarantees this cannot happen.
	 */
	u16 default_pat;
	int err;

	lockdep_assert_held_write(&vm->lock);

	if (is_madvise)
		ops = drm_gpuvm_madvise_ops_create(&vm->gpuvm, map_req);
	else
		ops = drm_gpuvm_sm_map_ops_create(&vm->gpuvm, map_req);

	if (IS_ERR(ops))
		return PTR_ERR(ops);

	/* Nothing to do: report success but still free the (empty) op list. */
	if (list_empty(&ops->list)) {
		err = 0;
		goto free_ops;
	}

	/*
	 * First pass: carry flags / default PAT index over from the VMAs that
	 * are going away to the MAP ops that replace them.
	 */
	drm_gpuva_for_each_op(__op, ops) {
		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
		struct xe_vma *vma = NULL;

		if (!is_madvise) {
			if (__op->op == DRM_GPUVA_OP_UNMAP) {
				vma = gpuva_to_vma(op->base.unmap.va);
				XE_WARN_ON(!xe_vma_has_default_mem_attrs(vma));
				default_pat = vma->attr.default_pat_index;
				vma_flags = vma->gpuva.flags;
			}

			if (__op->op == DRM_GPUVA_OP_REMAP) {
				vma = gpuva_to_vma(op->base.remap.unmap->va);
				default_pat = vma->attr.default_pat_index;
				vma_flags = vma->gpuva.flags;
			}

			if (__op->op == DRM_GPUVA_OP_MAP) {
				op->map.vma_flags |= vma_flags & XE_VMA_CREATE_MASK;
				op->map.pat_index = default_pat;
			}
		} else {
			if (__op->op == DRM_GPUVA_OP_REMAP) {
				vma = gpuva_to_vma(op->base.remap.unmap->va);
				/* remap_op tracks that each REMAP is paired with one MAP. */
				xe_assert(vm->xe, !remap_op);
				xe_assert(vm->xe, xe_vma_has_no_bo(vma));
				remap_op = true;
				vma_flags = vma->gpuva.flags;
			}

			if (__op->op == DRM_GPUVA_OP_MAP) {
				xe_assert(vm->xe, remap_op);
				remap_op = false;
				/*
				 * In case of madvise ops DRM_GPUVA_OP_MAP is
				 * always after DRM_GPUVA_OP_REMAP, so ensure
				 * to propagate the flags from the vma we're
				 * unmapping.
				 */
				op->map.vma_flags |= vma_flags & XE_VMA_CREATE_MASK;
			}
		}
		print_op(vm->xe, __op);
	}

	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);

	if (is_madvise)
		vops.flags |= XE_VMA_OPS_FLAG_MADVISE;
	else
		vops.flags |= XE_VMA_OPS_FLAG_ALLOW_SVM_UNMAP;

	err = vm_bind_ioctl_ops_parse(vm, ops, &vops);
	if (err)
		goto unwind_ops;

	/* Non-interruptible lock: per xe_vm_lock() this always returns 0. */
	xe_vm_lock(vm, false);

	/*
	 * Second pass: destroy the VMAs that were replaced and, for madvise,
	 * hand their memory attributes over to the new VMAs.
	 */
	drm_gpuva_for_each_op(__op, ops) {
		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
		struct xe_vma *vma;

		if (__op->op == DRM_GPUVA_OP_UNMAP) {
			vma = gpuva_to_vma(op->base.unmap.va);
			/* There should be no unmap for madvise */
			if (is_madvise)
				XE_WARN_ON("UNEXPECTED UNMAP");

			xe_vma_destroy(vma, NULL);
		} else if (__op->op == DRM_GPUVA_OP_REMAP) {
			vma = gpuva_to_vma(op->base.remap.unmap->va);
			/*
			 * For madvise ops, save the attributes of the VMA
			 * being unmapped by the REMAP so they can be assigned
			 * to the VMA created by the following MAP.
			 */
			if (is_madvise)
				xe_vma_mem_attr_copy(&tmp_attr, &vma->attr);

			xe_vma_destroy(gpuva_to_vma(op->base.remap.unmap->va), NULL);
		} else if (__op->op == DRM_GPUVA_OP_MAP) {
			vma = op->map.vma;
			/*
			 * For madvise, a MAP is always preceded by a REMAP, so
			 * tmp_attr holds the attributes saved from the
			 * remapped VMA and is safe to copy to the new VMA.
			 */
			if (is_madvise)
				xe_vma_mem_attr_copy(&vma->attr, &tmp_attr);
		}
	}

	xe_vm_unlock(vm);
	drm_gpuva_ops_free(&vm->gpuvm, ops);
	xe_vma_mem_attr_fini(&tmp_attr);
	return 0;

unwind_ops:
	vm_bind_ioctl_ops_unwind(vm, &ops, 1);
free_ops:
	drm_gpuva_ops_free(&vm->gpuvm, ops);
	return err;
}
4825 
4826 /**
4827  * xe_vm_alloc_madvise_vma - Allocate VMA's with madvise ops
4828  * @vm: Pointer to the xe_vm structure
4829  * @start: Starting input address
4830  * @range: Size of the input range
4831  *
4832  * This function splits existing vma to create new vma for user provided input range
4833  *
4834  * Return: 0 if success
4835  */
4836 int xe_vm_alloc_madvise_vma(struct xe_vm *vm, uint64_t start, uint64_t range)
4837 {
4838 	struct drm_gpuvm_map_req map_req = {
4839 		.map.va.addr = start,
4840 		.map.va.range = range,
4841 	};
4842 
4843 	lockdep_assert_held_write(&vm->lock);
4844 
4845 	vm_dbg(&vm->xe->drm, "MADVISE_OPS_CREATE: addr=0x%016llx, size=0x%016llx", start, range);
4846 
4847 	return xe_vm_alloc_vma(vm, &map_req, true);
4848 }
4849 
4850 static bool is_cpu_addr_vma_with_default_attr(struct xe_vma *vma)
4851 {
4852 	return vma && xe_vma_is_cpu_addr_mirror(vma) &&
4853 	       xe_vma_has_default_mem_attrs(vma);
4854 }
4855 
4856 /**
4857  * xe_vm_find_cpu_addr_mirror_vma_range - Extend a VMA range to include adjacent CPU-mirrored VMAs
4858  * @vm: VM to search within
4859  * @start: Input/output pointer to the starting address of the range
4860  * @end: Input/output pointer to the end address of the range
4861  *
4862  * Given a range defined by @start and @range, this function checks the VMAs
4863  * immediately before and after the range. If those neighboring VMAs are
4864  * CPU-address-mirrored and have default memory attributes, the function
4865  * updates @start and @range to include them. This extended range can then
4866  * be used for merging or other operations that require a unified VMA.
4867  *
4868  * The function does not perform the merge itself; it only computes the
4869  * mergeable boundaries.
4870  */
4871 void xe_vm_find_cpu_addr_mirror_vma_range(struct xe_vm *vm, u64 *start, u64 *end)
4872 {
4873 	struct xe_vma *prev, *next;
4874 
4875 	lockdep_assert_held(&vm->lock);
4876 
4877 	if (*start >= SZ_4K) {
4878 		prev = xe_vm_find_vma_by_addr(vm, *start - SZ_4K);
4879 		if (is_cpu_addr_vma_with_default_attr(prev))
4880 			*start = xe_vma_start(prev);
4881 	}
4882 
4883 	if (*end < vm->size) {
4884 		next = xe_vm_find_vma_by_addr(vm, *end + 1);
4885 		if (is_cpu_addr_vma_with_default_attr(next))
4886 			*end = xe_vma_end(next);
4887 	}
4888 }
4889 
4890 /**
4891  * xe_vm_alloc_cpu_addr_mirror_vma - Allocate CPU addr mirror vma
4892  * @vm: Pointer to the xe_vm structure
4893  * @start: Starting input address
4894  * @range: Size of the input range
4895  *
4896  * This function splits/merges existing vma to create new vma for user provided input range
4897  *
4898  * Return: 0 if success
4899  */
4900 int xe_vm_alloc_cpu_addr_mirror_vma(struct xe_vm *vm, uint64_t start, uint64_t range)
4901 {
4902 	struct drm_gpuvm_map_req map_req = {
4903 		.map.va.addr = start,
4904 		.map.va.range = range,
4905 	};
4906 
4907 	lockdep_assert_held_write(&vm->lock);
4908 
4909 	vm_dbg(&vm->xe->drm, "CPU_ADDR_MIRROR_VMA_OPS_CREATE: addr=0x%016llx, size=0x%016llx",
4910 	       start, range);
4911 
4912 	return xe_vm_alloc_vma(vm, &map_req, false);
4913 }
4914 
4915 /**
4916  * xe_vm_add_exec_queue() - Add exec queue to VM
4917  * @vm: The VM.
4918  * @q: The exec_queue
4919  *
4920  * Add exec queue to VM, skipped if the device does not have context based TLB
4921  * invalidations.
4922  */
4923 void xe_vm_add_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
4924 {
4925 	struct xe_device *xe = vm->xe;
4926 
4927 	/* User VMs and queues only */
4928 	xe_assert(xe, !(q->flags & EXEC_QUEUE_FLAG_KERNEL));
4929 	xe_assert(xe, !(q->flags & EXEC_QUEUE_FLAG_PERMANENT));
4930 	xe_assert(xe, !(q->flags & EXEC_QUEUE_FLAG_VM));
4931 	xe_assert(xe, !(q->flags & EXEC_QUEUE_FLAG_MIGRATE));
4932 	xe_assert(xe, vm->xef);
4933 	xe_assert(xe, vm == q->vm);
4934 
4935 	if (!xe->info.has_ctx_tlb_inval)
4936 		return;
4937 
4938 	down_write(&vm->exec_queues.lock);
4939 	list_add(&q->vm_exec_queue_link, &vm->exec_queues.list[q->gt->info.id]);
4940 	++vm->exec_queues.count[q->gt->info.id];
4941 	up_write(&vm->exec_queues.lock);
4942 }
4943 
4944 /**
4945  * xe_vm_remove_exec_queue() - Remove exec queue from VM
4946  * @vm: The VM.
4947  * @q: The exec_queue
4948  *
4949  * Remove exec queue from VM, skipped if the device does not have context based
4950  * TLB invalidations.
4951  */
4952 void xe_vm_remove_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
4953 {
4954 	if (!vm->xe->info.has_ctx_tlb_inval)
4955 		return;
4956 
4957 	down_write(&vm->exec_queues.lock);
4958 	if (!list_empty(&q->vm_exec_queue_link)) {
4959 		list_del(&q->vm_exec_queue_link);
4960 		--vm->exec_queues.count[q->gt->info.id];
4961 	}
4962 	up_write(&vm->exec_queues.lock);
4963 }
4964