1 // SPDX-License-Identifier: MIT
2 /*
3 * Copyright © 2021 Intel Corporation
4 */
5
6 #include "xe_vm.h"
7
8 #include <linux/dma-fence-array.h>
9 #include <linux/nospec.h>
10
11 #include <drm/drm_drv.h>
12 #include <drm/drm_exec.h>
13 #include <drm/drm_print.h>
14 #include <drm/ttm/ttm_tt.h>
15 #include <uapi/drm/xe_drm.h>
16 #include <linux/ascii85.h>
17 #include <linux/delay.h>
18 #include <linux/kthread.h>
19 #include <linux/mm.h>
20 #include <linux/swap.h>
21
22 #include <generated/xe_wa_oob.h>
23
24 #include "regs/xe_gtt_defs.h"
25 #include "xe_assert.h"
26 #include "xe_bo.h"
27 #include "xe_device.h"
28 #include "xe_drm_client.h"
29 #include "xe_exec_queue.h"
30 #include "xe_gt.h"
31 #include "xe_migrate.h"
32 #include "xe_pat.h"
33 #include "xe_pm.h"
34 #include "xe_preempt_fence.h"
35 #include "xe_pt.h"
36 #include "xe_pxp.h"
37 #include "xe_sriov_vf.h"
38 #include "xe_svm.h"
39 #include "xe_sync.h"
40 #include "xe_tile.h"
41 #include "xe_tlb_inval.h"
42 #include "xe_trace_bo.h"
43 #include "xe_vm_madvise.h"
44 #include "xe_wa.h"
45
46 static struct drm_gem_object *xe_vm_obj(struct xe_vm *vm)
47 {
48 return vm->gpuvm.r_obj;
49 }
50
51 /**
52 * xe_vm_drm_exec_lock() - Lock the vm's resv with a drm_exec transaction
53 * @vm: The vm whose resv is to be locked.
54 * @exec: The drm_exec transaction.
55 *
56 * Helper to lock the vm's resv as part of a drm_exec transaction.
57 *
58 * Return: %0 on success. See drm_exec_lock_obj() for error codes.
59 */
60 int xe_vm_drm_exec_lock(struct xe_vm *vm, struct drm_exec *exec)
61 {
62 return drm_exec_lock_obj(exec, xe_vm_obj(vm));
63 }
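
/*
 * Usage sketch (illustrative only, not a quote of an existing caller):
 * the lock is normally taken inside a drm_exec transaction so that
 * contention is handled by restarting the loop.
 *
 *	struct drm_exec exec;
 *	int err;
 *
 *	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
 *	drm_exec_until_all_locked(&exec) {
 *		err = xe_vm_drm_exec_lock(vm, &exec);
 *		drm_exec_retry_on_contention(&exec);
 *		if (err)
 *			break;
 *	}
 *	...
 *	drm_exec_fini(&exec);
 */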
64
65 static bool preempt_fences_waiting(struct xe_vm *vm)
66 {
67 struct xe_exec_queue *q;
68
69 lockdep_assert_held(&vm->lock);
70 xe_vm_assert_held(vm);
71
72 list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
73 if (!q->lr.pfence ||
74 test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
75 &q->lr.pfence->flags)) {
76 return true;
77 }
78 }
79
80 return false;
81 }
82
83 static void free_preempt_fences(struct list_head *list)
84 {
85 struct list_head *link, *next;
86
87 list_for_each_safe(link, next, list)
88 xe_preempt_fence_free(to_preempt_fence_from_link(link));
89 }
90
91 static int alloc_preempt_fences(struct xe_vm *vm, struct list_head *list,
92 unsigned int *count)
93 {
94 lockdep_assert_held(&vm->lock);
95 xe_vm_assert_held(vm);
96
97 if (*count >= vm->preempt.num_exec_queues)
98 return 0;
99
100 for (; *count < vm->preempt.num_exec_queues; ++(*count)) {
101 struct xe_preempt_fence *pfence = xe_preempt_fence_alloc();
102
103 if (IS_ERR(pfence))
104 return PTR_ERR(pfence);
105
106 list_move_tail(xe_preempt_fence_link(pfence), list);
107 }
108
109 return 0;
110 }
111
112 static int wait_for_existing_preempt_fences(struct xe_vm *vm)
113 {
114 struct xe_exec_queue *q;
115 bool vf_migration = IS_SRIOV_VF(vm->xe) &&
116 xe_sriov_vf_migration_supported(vm->xe);
117 signed long wait_time = vf_migration ? HZ / 5 : MAX_SCHEDULE_TIMEOUT;
118
119 xe_vm_assert_held(vm);
120
121 list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
122 if (q->lr.pfence) {
123 long timeout;
124
125 timeout = dma_fence_wait_timeout(q->lr.pfence, false,
126 wait_time);
127 if (!timeout) {
128 xe_assert(vm->xe, vf_migration);
129 return -EAGAIN;
130 }
131
132 /* Only -ETIME on fence indicates VM needs to be killed */
133 if (timeout < 0 || q->lr.pfence->error == -ETIME)
134 return -ETIME;
135
136 dma_fence_put(q->lr.pfence);
137 q->lr.pfence = NULL;
138 }
139 }
140
141 return 0;
142 }
143
144 static bool xe_vm_is_idle(struct xe_vm *vm)
145 {
146 struct xe_exec_queue *q;
147
148 xe_vm_assert_held(vm);
149 list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
150 if (!xe_exec_queue_is_idle(q))
151 return false;
152 }
153
154 return true;
155 }
156
157 static void arm_preempt_fences(struct xe_vm *vm, struct list_head *list)
158 {
159 struct list_head *link;
160 struct xe_exec_queue *q;
161
162 list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
163 struct dma_fence *fence;
164
165 link = list->next;
166 xe_assert(vm->xe, link != list);
167
168 fence = xe_preempt_fence_arm(to_preempt_fence_from_link(link),
169 q, q->lr.context,
170 ++q->lr.seqno);
171 dma_fence_put(q->lr.pfence);
172 q->lr.pfence = fence;
173 }
174 }
175
176 static int add_preempt_fences(struct xe_vm *vm, struct xe_bo *bo)
177 {
178 struct xe_exec_queue *q;
179 int err;
180
181 xe_bo_assert_held(bo);
182
183 if (!vm->preempt.num_exec_queues)
184 return 0;
185
186 err = dma_resv_reserve_fences(bo->ttm.base.resv, vm->preempt.num_exec_queues);
187 if (err)
188 return err;
189
190 list_for_each_entry(q, &vm->preempt.exec_queues, lr.link)
191 if (q->lr.pfence) {
192 dma_resv_add_fence(bo->ttm.base.resv,
193 q->lr.pfence,
194 DMA_RESV_USAGE_BOOKKEEP);
195 }
196
197 return 0;
198 }
199
200 static void resume_and_reinstall_preempt_fences(struct xe_vm *vm,
201 struct drm_exec *exec)
202 {
203 struct xe_exec_queue *q;
204
205 lockdep_assert_held(&vm->lock);
206 xe_vm_assert_held(vm);
207
208 list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
209 q->ops->resume(q);
210
211 drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, q->lr.pfence,
212 DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_BOOKKEEP);
213 }
214 }
215
216 int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
217 {
218 struct drm_gpuvm_exec vm_exec = {
219 .vm = &vm->gpuvm,
220 .flags = DRM_EXEC_INTERRUPTIBLE_WAIT,
221 .num_fences = 1,
222 };
223 struct drm_exec *exec = &vm_exec.exec;
224 struct xe_validation_ctx ctx;
225 struct dma_fence *pfence;
226 int err;
227 bool wait;
228
229 xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
230
231 down_write(&vm->lock);
232 err = xe_validation_exec_lock(&ctx, &vm_exec, &vm->xe->val);
233 if (err)
234 goto out_up_write;
235
236 pfence = xe_preempt_fence_create(q, q->lr.context,
237 ++q->lr.seqno);
238 if (IS_ERR(pfence)) {
239 err = PTR_ERR(pfence);
240 goto out_fini;
241 }
242
243 list_add(&q->lr.link, &vm->preempt.exec_queues);
244 ++vm->preempt.num_exec_queues;
245 q->lr.pfence = pfence;
246
247 xe_svm_notifier_lock(vm);
248
249 drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, pfence,
250 DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_BOOKKEEP);
251
252 /*
253 * Check whether a preemption on the VM or a userptr invalidation is in
254 * flight; if so, trigger this preempt fence to sync state with the
255 * other preempt fences on the VM.
256 */
257 wait = __xe_vm_userptr_needs_repin(vm) || preempt_fences_waiting(vm);
258 if (wait)
259 dma_fence_enable_sw_signaling(pfence);
260
261 xe_svm_notifier_unlock(vm);
262
263 out_fini:
264 xe_validation_ctx_fini(&ctx);
265 out_up_write:
266 up_write(&vm->lock);
267
268 return err;
269 }
270 ALLOW_ERROR_INJECTION(xe_vm_add_compute_exec_queue, ERRNO);
271
272 /**
273 * xe_vm_remove_compute_exec_queue() - Remove compute exec queue from VM
274 * @vm: The VM.
275 * @q: The exec_queue
276 *
277 * Note that this function might be called multiple times on the same queue.
278 */
279 void xe_vm_remove_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
280 {
281 if (!xe_vm_in_preempt_fence_mode(vm))
282 return;
283
284 down_write(&vm->lock);
285 if (!list_empty(&q->lr.link)) {
286 list_del_init(&q->lr.link);
287 --vm->preempt.num_exec_queues;
288 }
289 if (q->lr.pfence) {
290 dma_fence_enable_sw_signaling(q->lr.pfence);
291 dma_fence_put(q->lr.pfence);
292 q->lr.pfence = NULL;
293 }
294 up_write(&vm->lock);
295 }
296
297 #define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000
298
299 /**
300 * xe_vm_kill() - VM Kill
301 * @vm: The VM.
302 * @unlocked: Flag indicating the VM's dma-resv is not held
303 *
304 * Kill the VM by setting the banned flag, indicating the VM is no longer
305 * available for use. If in preempt fence mode, also kill all exec queues attached to the VM.
306 */
307 void xe_vm_kill(struct xe_vm *vm, bool unlocked)
308 {
309 struct xe_exec_queue *q;
310
311 lockdep_assert_held(&vm->lock);
312
313 if (unlocked)
314 xe_vm_lock(vm, false);
315
316 vm->flags |= XE_VM_FLAG_BANNED;
317 trace_xe_vm_kill(vm);
318
319 list_for_each_entry(q, &vm->preempt.exec_queues, lr.link)
320 q->ops->kill(q);
321
322 if (unlocked)
323 xe_vm_unlock(vm);
324
325 /* TODO: Inform user the VM is banned */
326 }
327
328 static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec)
329 {
330 struct xe_vm *vm = gpuvm_to_vm(vm_bo->vm);
331 struct xe_bo *bo = gem_to_xe_bo(vm_bo->obj);
332 struct drm_gpuva *gpuva;
333 int ret;
334
335 lockdep_assert_held(&vm->lock);
336 drm_gpuvm_bo_for_each_va(gpuva, vm_bo)
337 list_move_tail(&gpuva_to_vma(gpuva)->combined_links.rebind,
338 &vm->rebind_list);
339
340 /* Skip re-populating purged BOs; the rebind maps scratch pages. */
341 if (xe_bo_is_purged(bo)) {
342 vm_bo->evicted = false;
343 return 0;
344 }
345
346 if (!try_wait_for_completion(&vm->xe->pm_block))
347 return -EAGAIN;
348
349 ret = xe_bo_validate(bo, vm, false, exec);
350 if (ret)
351 return ret;
352
353 vm_bo->evicted = false;
354 return 0;
355 }
356
357 /**
358 * xe_vm_validate_rebind() - Validate buffer objects and rebind vmas
359 * @vm: The vm for which we are rebinding.
360 * @exec: The struct drm_exec with the locked GEM objects.
361 * @num_fences: The number of fences to reserve for the operation, not
362 * including rebinds and validations.
363 *
364 * Validates all evicted gem objects and rebinds their vmas. Note that
365 * rebindings may cause evictions and hence the validation-rebind
366 * sequence is rerun until there are no more objects to validate.
367 *
368 * Return: 0 on success, negative error code on error. In particular,
369 * may return -EINTR or -ERESTARTSYS if interrupted, and -EDEADLK if
370 * the drm_exec transaction needs to be restarted.
371 */
372 int xe_vm_validate_rebind(struct xe_vm *vm, struct drm_exec *exec,
373 unsigned int num_fences)
374 {
375 struct drm_gem_object *obj;
376 unsigned long index;
377 int ret;
378
379 do {
380 ret = drm_gpuvm_validate(&vm->gpuvm, exec);
381 if (ret)
382 return ret;
383
384 ret = xe_vm_rebind(vm, false);
385 if (ret)
386 return ret;
387 } while (!list_empty(&vm->gpuvm.evict.list));
388
389 drm_exec_for_each_locked_object(exec, index, obj) {
390 ret = dma_resv_reserve_fences(obj->resv, num_fences);
391 if (ret)
392 return ret;
393 }
394
395 return 0;
396 }
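
/*
 * Caller sketch (an assumption about typical usage, not an existing caller):
 * the function is meant to run inside a drm_exec transaction with the VM's
 * objects locked, restarting on -EDEADLK via drm_exec_retry_on_contention().
 *
 *	drm_exec_until_all_locked(&exec) {
 *		err = drm_gpuvm_prepare_objects(&vm->gpuvm, &exec, 0);
 *		drm_exec_retry_on_contention(&exec);
 *		if (!err)
 *			err = xe_vm_validate_rebind(vm, &exec, 1);
 *		drm_exec_retry_on_contention(&exec);
 *	}
 */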
397
398 static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm,
399 bool *done)
400 {
401 int err;
402
403 err = drm_gpuvm_prepare_vm(&vm->gpuvm, exec, 0);
404 if (err)
405 return err;
406
407 if (xe_vm_is_idle(vm)) {
408 vm->preempt.rebind_deactivated = true;
409 *done = true;
410 return 0;
411 }
412
413 if (!preempt_fences_waiting(vm)) {
414 *done = true;
415 return 0;
416 }
417
418 err = drm_gpuvm_prepare_objects(&vm->gpuvm, exec, 0);
419 if (err)
420 return err;
421
422 err = wait_for_existing_preempt_fences(vm);
423 if (err)
424 return err;
425
426 /*
427 * Add validation and rebinding to the locking loop since both can
428 * cause evictions which may require blocking dma_resv locks.
429 * The fence reservation here is intended for the new preempt fences
430 * we attach at the end of the rebind work.
431 */
432 return xe_vm_validate_rebind(vm, exec, vm->preempt.num_exec_queues);
433 }
434
435 static bool vm_suspend_rebind_worker(struct xe_vm *vm)
436 {
437 struct xe_device *xe = vm->xe;
438 bool ret = false;
439
440 mutex_lock(&xe->rebind_resume_lock);
441 if (!try_wait_for_completion(&vm->xe->pm_block)) {
442 ret = true;
443 list_move_tail(&vm->preempt.pm_activate_link, &xe->rebind_resume_list);
444 }
445 mutex_unlock(&xe->rebind_resume_lock);
446
447 return ret;
448 }
449
450 /**
451 * xe_vm_resume_rebind_worker() - Resume the rebind worker.
452 * @vm: The vm whose preempt worker to resume.
453 *
454 * Resume a preempt worker that was previously suspended by
455 * vm_suspend_rebind_worker().
456 */
457 void xe_vm_resume_rebind_worker(struct xe_vm *vm)
458 {
459 queue_work(vm->xe->ordered_wq, &vm->preempt.rebind_work);
460 }
461
462 static void preempt_rebind_work_func(struct work_struct *w)
463 {
464 struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
465 struct xe_validation_ctx ctx;
466 struct drm_exec exec;
467 unsigned int fence_count = 0;
468 LIST_HEAD(preempt_fences);
469 int err = 0;
470 long wait;
471 int __maybe_unused tries = 0;
472
473 xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
474 trace_xe_vm_rebind_worker_enter(vm);
475
476 down_write(&vm->lock);
477
478 if (xe_vm_is_closed_or_banned(vm)) {
479 up_write(&vm->lock);
480 trace_xe_vm_rebind_worker_exit(vm);
481 return;
482 }
483
484 retry:
485 if (!try_wait_for_completion(&vm->xe->pm_block) && vm_suspend_rebind_worker(vm)) {
486 up_write(&vm->lock);
487 /* We don't actually block, but we don't make progress either. */
488 xe_pm_might_block_on_suspend();
489 return;
490 }
491
492 if (xe_vm_userptr_check_repin(vm)) {
493 err = xe_vm_userptr_pin(vm);
494 if (err)
495 goto out_unlock_outer;
496 }
497
498 err = xe_validation_ctx_init(&ctx, &vm->xe->val, &exec,
499 (struct xe_val_flags) {.interruptible = true});
500 if (err)
501 goto out_unlock_outer;
502
503 drm_exec_until_all_locked(&exec) {
504 bool done = false;
505
506 err = xe_preempt_work_begin(&exec, vm, &done);
507 drm_exec_retry_on_contention(&exec);
508 xe_validation_retry_on_oom(&ctx, &err);
509 if (err || done) {
510 xe_validation_ctx_fini(&ctx);
511 goto out_unlock_outer;
512 }
513 }
514
515 err = alloc_preempt_fences(vm, &preempt_fences, &fence_count);
516 if (err)
517 goto out_unlock;
518
519 xe_vm_set_validation_exec(vm, &exec);
520 err = xe_vm_rebind(vm, true);
521 xe_vm_set_validation_exec(vm, NULL);
522 if (err)
523 goto out_unlock;
524
525 /* Wait on rebinds and munmap style VM unbinds */
526 wait = dma_resv_wait_timeout(xe_vm_resv(vm),
527 DMA_RESV_USAGE_KERNEL,
528 false, MAX_SCHEDULE_TIMEOUT);
529 if (wait <= 0) {
530 err = -ETIME;
531 goto out_unlock;
532 }
533
534 #define retry_required(__tries, __vm) \
535 (IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT) ? \
536 (!(__tries)++ || __xe_vm_userptr_needs_repin(__vm)) : \
537 __xe_vm_userptr_needs_repin(__vm))
538
539 xe_svm_notifier_lock(vm);
540 if (retry_required(tries, vm)) {
541 xe_svm_notifier_unlock(vm);
542 err = -EAGAIN;
543 goto out_unlock;
544 }
545
546 #undef retry_required
547
548 spin_lock(&vm->xe->ttm.lru_lock);
549 ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
550 spin_unlock(&vm->xe->ttm.lru_lock);
551
552 /* Point of no return. */
553 arm_preempt_fences(vm, &preempt_fences);
554 resume_and_reinstall_preempt_fences(vm, &exec);
555 xe_svm_notifier_unlock(vm);
556
557 out_unlock:
558 xe_validation_ctx_fini(&ctx);
559 out_unlock_outer:
560 if (err == -EAGAIN) {
561 trace_xe_vm_rebind_worker_retry(vm);
562
563 /*
564 * We can't block in workers on a VF which supports migration
565 * given this can block the VF post-migration workers from
566 * getting scheduled.
567 */
568 if (IS_SRIOV_VF(vm->xe) &&
569 xe_sriov_vf_migration_supported(vm->xe)) {
570 up_write(&vm->lock);
571 xe_vm_queue_rebind_worker(vm);
572 return;
573 }
574
575 goto retry;
576 }
577
578 if (err) {
579 drm_warn(&vm->xe->drm, "VM worker error: %d\n", err);
580 xe_vm_kill(vm, true);
581 }
582 up_write(&vm->lock);
583
584 free_preempt_fences(&preempt_fences);
585
586 trace_xe_vm_rebind_worker_exit(vm);
587 }
588
589 /**
590 * xe_vm_add_fault_entry_pf() - Add pagefault to vm fault list
591 * @vm: The VM.
592 * @pf: The pagefault.
593 *
594 * This function takes the data from the pagefault @pf and saves it to @vm->faults.list.
595 *
596 * The function reports a warning if memory for the fault entry could not be
597 * allocated, and exits silently if the list is already full.
598 */
599 void xe_vm_add_fault_entry_pf(struct xe_vm *vm, struct xe_pagefault *pf)
600 {
601 struct xe_vm_fault_entry *e;
602 struct xe_hw_engine *hwe;
603
604 /* Do not report faults on reserved engines */
605 hwe = xe_gt_hw_engine(pf->gt, pf->consumer.engine_class,
606 pf->consumer.engine_instance, false);
607 if (!hwe || xe_hw_engine_is_reserved(hwe))
608 return;
609
610 e = kzalloc_obj(*e);
611 if (!e) {
612 drm_warn(&vm->xe->drm,
613 "Could not allocate memory for fault!\n");
614 return;
615 }
616
617 guard(spinlock)(&vm->faults.lock);
618
619 /*
620 * Limit the number of faults in the fault list to prevent
621 * memory overuse.
622 */
623 if (vm->faults.len >= MAX_FAULTS_SAVED_PER_VM) {
624 kfree(e);
625 return;
626 }
627
628 e->address = pf->consumer.page_addr;
629 /*
630 * TODO:
631 * Address precision is currently always SZ_4K, but this may change
632 * in the future.
633 */
634 e->address_precision = SZ_4K;
635 e->access_type = pf->consumer.access_type;
636 e->fault_type = FIELD_GET(XE_PAGEFAULT_TYPE_MASK,
637 pf->consumer.fault_type_level);
638 e->fault_level = FIELD_GET(XE_PAGEFAULT_LEVEL_MASK,
639 pf->consumer.fault_type_level);
640
641 list_add_tail(&e->list, &vm->faults.list);
642 vm->faults.len++;
643 }
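
/*
 * Reader-side sketch (hypothetical consumer, not part of this file): entries
 * saved above are meant to be walked under vm->faults.lock, e.g.:
 *
 *	scoped_guard(spinlock, &vm->faults.lock) {
 *		struct xe_vm_fault_entry *e;
 *
 *		list_for_each_entry(e, &vm->faults.list, list)
 *			drm_dbg(&vm->xe->drm, "fault at %#llx\n", e->address);
 *	}
 */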
644
645 static void xe_vm_clear_fault_entries(struct xe_vm *vm)
646 {
647 struct xe_vm_fault_entry *e, *tmp;
648
649 guard(spinlock)(&vm->faults.lock);
650 list_for_each_entry_safe(e, tmp, &vm->faults.list, list) {
651 list_del(&e->list);
652 kfree(e);
653 }
654 vm->faults.len = 0;
655 }
656
657 static int xe_vma_ops_alloc(struct xe_vma_ops *vops, bool array_of_binds)
658 {
659 int i;
660
661 for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i) {
662 if (!vops->pt_update_ops[i].num_ops)
663 continue;
664
665 vops->pt_update_ops[i].ops =
666 kmalloc_objs(*vops->pt_update_ops[i].ops,
667 vops->pt_update_ops[i].num_ops,
668 GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
669 if (!vops->pt_update_ops[i].ops)
670 return array_of_binds ? -ENOBUFS : -ENOMEM;
671 }
672
673 return 0;
674 }
675 ALLOW_ERROR_INJECTION(xe_vma_ops_alloc, ERRNO);
676
677 static void xe_vma_svm_prefetch_op_fini(struct xe_vma_op *op)
678 {
679 struct xe_vma *vma;
680
681 vma = gpuva_to_vma(op->base.prefetch.va);
682
683 if (op->base.op == DRM_GPUVA_OP_PREFETCH && xe_vma_is_cpu_addr_mirror(vma))
684 xa_destroy(&op->prefetch_range.range);
685 }
686
687 static void xe_vma_svm_prefetch_ops_fini(struct xe_vma_ops *vops)
688 {
689 struct xe_vma_op *op;
690
691 if (!(vops->flags & XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH))
692 return;
693
694 list_for_each_entry(op, &vops->list, link)
695 xe_vma_svm_prefetch_op_fini(op);
696 }
697
698 static void xe_vma_ops_fini(struct xe_vma_ops *vops)
699 {
700 int i;
701
702 xe_vma_svm_prefetch_ops_fini(vops);
703
704 for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
705 kfree(vops->pt_update_ops[i].ops);
706 }
707
708 static void xe_vma_ops_incr_pt_update_ops(struct xe_vma_ops *vops, u8 tile_mask, int inc_val)
709 {
710 int i;
711
712 if (!inc_val)
713 return;
714
715 for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
716 if (BIT(i) & tile_mask)
717 vops->pt_update_ops[i].num_ops += inc_val;
718 }
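
/*
 * Illustrative example: with tile_mask == (BIT(0) | BIT(1)) and inc_val == 1,
 * the function above bumps pt_update_ops[0].num_ops and
 * pt_update_ops[1].num_ops by one each, which later sizes the per-tile
 * allocations done in xe_vma_ops_alloc().
 */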
719
720 #define XE_VMA_CREATE_MASK ( \
721 XE_VMA_READ_ONLY | \
722 XE_VMA_DUMPABLE | \
723 XE_VMA_SYSTEM_ALLOCATOR | \
724 DRM_GPUVA_SPARSE | \
725 XE_VMA_MADV_AUTORESET)
726
727 static void xe_vm_populate_rebind(struct xe_vma_op *op, struct xe_vma *vma,
728 u8 tile_mask)
729 {
730 INIT_LIST_HEAD(&op->link);
731 op->tile_mask = tile_mask;
732 op->base.op = DRM_GPUVA_OP_MAP;
733 op->base.map.va.addr = vma->gpuva.va.addr;
734 op->base.map.va.range = vma->gpuva.va.range;
735 op->base.map.gem.obj = vma->gpuva.gem.obj;
736 op->base.map.gem.offset = vma->gpuva.gem.offset;
737 op->map.vma = vma;
738 op->map.immediate = true;
739 op->map.vma_flags = vma->gpuva.flags & XE_VMA_CREATE_MASK;
740 }
741
742 static int xe_vm_ops_add_rebind(struct xe_vma_ops *vops, struct xe_vma *vma,
743 u8 tile_mask)
744 {
745 struct xe_vma_op *op;
746
747 op = kzalloc_obj(*op);
748 if (!op)
749 return -ENOMEM;
750
751 xe_vm_populate_rebind(op, vma, tile_mask);
752 list_add_tail(&op->link, &vops->list);
753 xe_vma_ops_incr_pt_update_ops(vops, tile_mask, 1);
754
755 return 0;
756 }
757
758 static struct dma_fence *ops_execute(struct xe_vm *vm,
759 struct xe_vma_ops *vops);
760 static void xe_vma_ops_init(struct xe_vma_ops *vops, struct xe_vm *vm,
761 struct xe_exec_queue *q,
762 struct xe_sync_entry *syncs, u32 num_syncs);
763
764 int xe_vm_rebind(struct xe_vm *vm, bool rebind_worker)
765 {
766 struct dma_fence *fence;
767 struct xe_vma *vma, *next;
768 struct xe_vma_ops vops;
769 struct xe_vma_op *op, *next_op;
770 int err, i;
771
772 lockdep_assert_held(&vm->lock);
773 if ((xe_vm_in_lr_mode(vm) && !rebind_worker) ||
774 list_empty(&vm->rebind_list))
775 return 0;
776
777 xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
778 for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
779 vops.pt_update_ops[i].wait_vm_bookkeep = true;
780
781 xe_vm_assert_held(vm);
782 list_for_each_entry(vma, &vm->rebind_list, combined_links.rebind) {
783 xe_assert(vm->xe, vma->tile_present);
784
785 if (rebind_worker)
786 trace_xe_vma_rebind_worker(vma);
787 else
788 trace_xe_vma_rebind_exec(vma);
789
790 err = xe_vm_ops_add_rebind(&vops, vma,
791 vma->tile_present);
792 if (err)
793 goto free_ops;
794 }
795
796 err = xe_vma_ops_alloc(&vops, false);
797 if (err)
798 goto free_ops;
799
800 fence = ops_execute(vm, &vops);
801 if (IS_ERR(fence)) {
802 err = PTR_ERR(fence);
803 } else {
804 dma_fence_put(fence);
805 list_for_each_entry_safe(vma, next, &vm->rebind_list,
806 combined_links.rebind)
807 list_del_init(&vma->combined_links.rebind);
808 }
809 free_ops:
810 list_for_each_entry_safe(op, next_op, &vops.list, link) {
811 list_del(&op->link);
812 kfree(op);
813 }
814 xe_vma_ops_fini(&vops);
815
816 return err;
817 }
818
819 struct dma_fence *xe_vma_rebind(struct xe_vm *vm, struct xe_vma *vma, u8 tile_mask)
820 {
821 struct dma_fence *fence = NULL;
822 struct xe_vma_ops vops;
823 struct xe_vma_op *op, *next_op;
824 struct xe_tile *tile;
825 u8 id;
826 int err;
827
828 lockdep_assert_held(&vm->lock);
829 xe_vm_assert_held(vm);
830 xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
831
832 xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
833 vops.flags |= XE_VMA_OPS_FLAG_SKIP_TLB_WAIT;
834 for_each_tile(tile, vm->xe, id) {
835 vops.pt_update_ops[id].wait_vm_bookkeep = true;
836 vops.pt_update_ops[tile->id].q =
837 xe_migrate_exec_queue(tile->migrate);
838 }
839
840 err = xe_vm_ops_add_rebind(&vops, vma, tile_mask);
841 if (err)
842 return ERR_PTR(err);
843
844 err = xe_vma_ops_alloc(&vops, false);
845 if (err) {
846 fence = ERR_PTR(err);
847 goto free_ops;
848 }
849
850 fence = ops_execute(vm, &vops);
851
852 free_ops:
853 list_for_each_entry_safe(op, next_op, &vops.list, link) {
854 list_del(&op->link);
855 kfree(op);
856 }
857 xe_vma_ops_fini(&vops);
858
859 return fence;
860 }
861
862 static void xe_vm_populate_range_rebind(struct xe_vma_op *op,
863 struct xe_vma *vma,
864 struct xe_svm_range *range,
865 u8 tile_mask)
866 {
867 INIT_LIST_HEAD(&op->link);
868 op->tile_mask = tile_mask;
869 op->base.op = DRM_GPUVA_OP_DRIVER;
870 op->subop = XE_VMA_SUBOP_MAP_RANGE;
871 op->map_range.vma = vma;
872 op->map_range.range = range;
873 }
874
875 static int
876 xe_vm_ops_add_range_rebind(struct xe_vma_ops *vops,
877 struct xe_vma *vma,
878 struct xe_svm_range *range,
879 u8 tile_mask)
880 {
881 struct xe_vma_op *op;
882
883 op = kzalloc_obj(*op);
884 if (!op)
885 return -ENOMEM;
886
887 xe_vm_populate_range_rebind(op, vma, range, tile_mask);
888 list_add_tail(&op->link, &vops->list);
889 xe_vma_ops_incr_pt_update_ops(vops, tile_mask, 1);
890
891 return 0;
892 }
893
894 /**
895 * xe_vm_range_rebind() - VM range (re)bind
896 * @vm: The VM which the range belongs to.
897 * @vma: The VMA which the range belongs to.
898 * @range: SVM range to rebind.
899 * @tile_mask: Tile mask to bind the range to.
900 *
901 * (re)bind SVM range setting up GPU page tables for the range.
902 *
903 * Return: dma fence for rebind to signal completion on success, ERR_PTR on
904 * failure
905 */
906 struct dma_fence *xe_vm_range_rebind(struct xe_vm *vm,
907 struct xe_vma *vma,
908 struct xe_svm_range *range,
909 u8 tile_mask)
910 {
911 struct dma_fence *fence = NULL;
912 struct xe_vma_ops vops;
913 struct xe_vma_op *op, *next_op;
914 struct xe_tile *tile;
915 u8 id;
916 int err;
917
918 lockdep_assert_held(&vm->lock);
919 xe_vm_assert_held(vm);
920 xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
921 xe_assert(vm->xe, xe_vma_is_cpu_addr_mirror(vma));
922
923 xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
924 vops.flags |= XE_VMA_OPS_FLAG_SKIP_TLB_WAIT;
925 for_each_tile(tile, vm->xe, id) {
926 vops.pt_update_ops[id].wait_vm_bookkeep = true;
927 vops.pt_update_ops[tile->id].q =
928 xe_migrate_exec_queue(tile->migrate);
929 }
930
931 err = xe_vm_ops_add_range_rebind(&vops, vma, range, tile_mask);
932 if (err)
933 return ERR_PTR(err);
934
935 err = xe_vma_ops_alloc(&vops, false);
936 if (err) {
937 fence = ERR_PTR(err);
938 goto free_ops;
939 }
940
941 fence = ops_execute(vm, &vops);
942
943 free_ops:
944 list_for_each_entry_safe(op, next_op, &vops.list, link) {
945 list_del(&op->link);
946 kfree(op);
947 }
948 xe_vma_ops_fini(&vops);
949
950 return fence;
951 }
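
/*
 * Caller sketch (an assumption about usage, e.g. from a fault handler):
 * the returned fence signals when the range is bound and must be put by
 * the caller.
 *
 *	fence = xe_vm_range_rebind(vm, vma, range, tile_mask);
 *	if (IS_ERR(fence))
 *		return PTR_ERR(fence);
 *	dma_fence_wait(fence, false);
 *	dma_fence_put(fence);
 */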
952
953 static void xe_vm_populate_range_unbind(struct xe_vma_op *op,
954 struct xe_svm_range *range)
955 {
956 INIT_LIST_HEAD(&op->link);
957 op->tile_mask = range->tile_present;
958 op->base.op = DRM_GPUVA_OP_DRIVER;
959 op->subop = XE_VMA_SUBOP_UNMAP_RANGE;
960 op->unmap_range.range = range;
961 }
962
963 static int
964 xe_vm_ops_add_range_unbind(struct xe_vma_ops *vops,
965 struct xe_svm_range *range)
966 {
967 struct xe_vma_op *op;
968
969 op = kzalloc_obj(*op);
970 if (!op)
971 return -ENOMEM;
972
973 xe_vm_populate_range_unbind(op, range);
974 list_add_tail(&op->link, &vops->list);
975 xe_vma_ops_incr_pt_update_ops(vops, range->tile_present, 1);
976
977 return 0;
978 }
979
980 /**
981 * xe_vm_range_unbind() - VM range unbind
982 * @vm: The VM which the range belongs to.
983 * @range: SVM range to rebind.
984 *
985 * Unbind SVM range removing the GPU page tables for the range.
986 *
987 * Return: dma fence for unbind to signal completion on success, ERR_PTR on
988 * failure
989 */
990 struct dma_fence *xe_vm_range_unbind(struct xe_vm *vm,
991 struct xe_svm_range *range)
992 {
993 struct dma_fence *fence = NULL;
994 struct xe_vma_ops vops;
995 struct xe_vma_op *op, *next_op;
996 struct xe_tile *tile;
997 u8 id;
998 int err;
999
1000 lockdep_assert_held(&vm->lock);
1001 xe_vm_assert_held(vm);
1002 xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
1003
1004 if (!range->tile_present)
1005 return dma_fence_get_stub();
1006
1007 xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
1008 for_each_tile(tile, vm->xe, id) {
1009 vops.pt_update_ops[id].wait_vm_bookkeep = true;
1010 vops.pt_update_ops[tile->id].q =
1011 xe_migrate_exec_queue(tile->migrate);
1012 }
1013
1014 err = xe_vm_ops_add_range_unbind(&vops, range);
1015 if (err)
1016 return ERR_PTR(err);
1017
1018 err = xe_vma_ops_alloc(&vops, false);
1019 if (err) {
1020 fence = ERR_PTR(err);
1021 goto free_ops;
1022 }
1023
1024 fence = ops_execute(vm, &vops);
1025
1026 free_ops:
1027 list_for_each_entry_safe(op, next_op, &vops.list, link) {
1028 list_del(&op->link);
1029 kfree(op);
1030 }
1031 xe_vma_ops_fini(&vops);
1032
1033 return fence;
1034 }
1035
1036 static void xe_vma_mem_attr_fini(struct xe_vma_mem_attr *attr)
1037 {
1038 drm_pagemap_put(attr->preferred_loc.dpagemap);
1039 }
1040
1041 static void xe_vma_free(struct xe_vma *vma)
1042 {
1043 xe_vma_mem_attr_fini(&vma->attr);
1044
1045 if (xe_vma_is_userptr(vma))
1046 kfree(to_userptr_vma(vma));
1047 else
1048 kfree(vma);
1049 }
1050
1051 /**
1052 * xe_vma_mem_attr_copy() - copy an xe_vma_mem_attr structure.
1053 * @to: Destination.
1054 * @from: Source.
1055 *
1056 * Copies an xe_vma_mem_attr structure taking care to get reference
1057 * counting of individual members right.
1058 */
1059 void xe_vma_mem_attr_copy(struct xe_vma_mem_attr *to, struct xe_vma_mem_attr *from)
1060 {
1061 xe_vma_mem_attr_fini(to);
1062 *to = *from;
1063 if (to->preferred_loc.dpagemap)
1064 drm_pagemap_get(to->preferred_loc.dpagemap);
1065 }
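
/*
 * Usage sketch (illustrative): the destination must hold either zeroed or
 * valid attributes since it is fini'ed first, and the copy takes its own
 * dpagemap reference which must eventually be dropped again.
 *
 *	struct xe_vma_mem_attr tmp = {};
 *
 *	xe_vma_mem_attr_copy(&tmp, &vma->attr);
 *	...
 *	xe_vma_mem_attr_fini(&tmp);
 */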
1066
1067 static struct xe_vma *xe_vma_create(struct xe_vm *vm,
1068 struct xe_bo *bo,
1069 u64 bo_offset_or_userptr,
1070 u64 start, u64 end,
1071 struct xe_vma_mem_attr *attr,
1072 unsigned int flags)
1073 {
1074 struct xe_vma *vma;
1075 struct xe_tile *tile;
1076 u8 id;
1077 bool is_null = (flags & DRM_GPUVA_SPARSE);
1078 bool is_cpu_addr_mirror = (flags & XE_VMA_SYSTEM_ALLOCATOR);
1079
1080 xe_assert(vm->xe, start < end);
1081 xe_assert(vm->xe, end < vm->size);
1082
1083 /*
1084 * Allocate and ensure that the xe_vma_is_userptr() return
1085 * matches what was allocated.
1086 */
1087 if (!bo && !is_null && !is_cpu_addr_mirror) {
1088 struct xe_userptr_vma *uvma = kzalloc_obj(*uvma);
1089
1090 if (!uvma)
1091 return ERR_PTR(-ENOMEM);
1092
1093 vma = &uvma->vma;
1094 } else {
1095 vma = kzalloc_obj(*vma);
1096 if (!vma)
1097 return ERR_PTR(-ENOMEM);
1098
1099 if (bo)
1100 vma->gpuva.gem.obj = &bo->ttm.base;
1101 }
1102
1103 INIT_LIST_HEAD(&vma->combined_links.rebind);
1104
1105 INIT_LIST_HEAD(&vma->gpuva.gem.entry);
1106 vma->gpuva.vm = &vm->gpuvm;
1107 vma->gpuva.va.addr = start;
1108 vma->gpuva.va.range = end - start + 1;
1109 vma->gpuva.flags = flags;
1110
1111 for_each_tile(tile, vm->xe, id)
1112 vma->tile_mask |= 0x1 << id;
1113
1114 if (vm->xe->info.has_atomic_enable_pte_bit)
1115 vma->gpuva.flags |= XE_VMA_ATOMIC_PTE_BIT;
1116
1117 xe_vma_mem_attr_copy(&vma->attr, attr);
1118 if (bo) {
1119 struct drm_gpuvm_bo *vm_bo;
1120
1121 xe_bo_assert_held(bo);
1122
1123 vm_bo = drm_gpuvm_bo_obtain_locked(vma->gpuva.vm, &bo->ttm.base);
1124 if (IS_ERR(vm_bo)) {
1125 xe_vma_free(vma);
1126 return ERR_CAST(vm_bo);
1127 }
1128
1129 drm_gpuvm_bo_extobj_add(vm_bo);
1130 drm_gem_object_get(&bo->ttm.base);
1131 vma->gpuva.gem.offset = bo_offset_or_userptr;
1132 drm_gpuva_link(&vma->gpuva, vm_bo);
1133 drm_gpuvm_bo_put(vm_bo);
1134 } else /* userptr or null */ {
1135 if (!is_null && !is_cpu_addr_mirror) {
1136 struct xe_userptr_vma *uvma = to_userptr_vma(vma);
1137 u64 size = end - start + 1;
1138 int err;
1139
1140 vma->gpuva.gem.offset = bo_offset_or_userptr;
1141
1142 err = xe_userptr_setup(uvma, xe_vma_userptr(vma), size);
1143 if (err) {
1144 xe_vma_free(vma);
1145 return ERR_PTR(err);
1146 }
1147 }
1148
1149 xe_vm_get(vm);
1150 }
1151
1152 return vma;
1153 }
1154
1155 static void xe_vma_destroy_late(struct xe_vma *vma)
1156 {
1157 struct xe_vm *vm = xe_vma_vm(vma);
1158 struct xe_bo *bo = xe_vma_bo(vma);
1159
1160 if (vma->ufence) {
1161 xe_sync_ufence_put(vma->ufence);
1162 vma->ufence = NULL;
1163 }
1164
1165 if (xe_vma_is_userptr(vma)) {
1166 struct xe_userptr_vma *uvma = to_userptr_vma(vma);
1167
1168 xe_userptr_remove(uvma);
1169 xe_vm_put(vm);
1170 } else if (xe_vma_is_null(vma) || xe_vma_is_cpu_addr_mirror(vma)) {
1171 xe_vm_put(vm);
1172 } else {
1173 xe_bo_put(bo);
1174 }
1175
1176 xe_vma_free(vma);
1177 }
1178
1179 static void vma_destroy_work_func(struct work_struct *w)
1180 {
1181 struct xe_vma *vma =
1182 container_of(w, struct xe_vma, destroy_work);
1183
1184 xe_vma_destroy_late(vma);
1185 }
1186
1187 static void vma_destroy_cb(struct dma_fence *fence,
1188 struct dma_fence_cb *cb)
1189 {
1190 struct xe_vma *vma = container_of(cb, struct xe_vma, destroy_cb);
1191
1192 INIT_WORK(&vma->destroy_work, vma_destroy_work_func);
1193 queue_work(system_dfl_wq, &vma->destroy_work);
1194 }
1195
1196 static void xe_vma_destroy(struct xe_vma *vma, struct dma_fence *fence)
1197 {
1198 struct xe_vm *vm = xe_vma_vm(vma);
1199 struct xe_bo *bo = xe_vma_bo(vma);
1200
1201 lockdep_assert_held_write(&vm->lock);
1202 xe_assert(vm->xe, list_empty(&vma->combined_links.destroy));
1203
1204 if (xe_vma_is_userptr(vma)) {
1205 xe_assert(vm->xe, vma->gpuva.flags & XE_VMA_DESTROYED);
1206 xe_userptr_destroy(to_userptr_vma(vma));
1207 } else if (!xe_vma_is_null(vma) && !xe_vma_is_cpu_addr_mirror(vma)) {
1208 xe_bo_assert_held(bo);
1209
1210 drm_gpuva_unlink(&vma->gpuva);
1211 xe_bo_recompute_purgeable_state(bo);
1212 }
1213
1214 xe_vm_assert_held(vm);
1215 if (fence) {
1216 int ret = dma_fence_add_callback(fence, &vma->destroy_cb,
1217 vma_destroy_cb);
1218
1219 if (ret) {
1220 XE_WARN_ON(ret != -ENOENT);
1221 xe_vma_destroy_late(vma);
1222 }
1223 } else {
1224 xe_vma_destroy_late(vma);
1225 }
1226 }
1227
1228 /**
1229 * xe_vm_lock_vma() - drm_exec utility to lock a vma
1230 * @exec: The drm_exec object we're currently locking for.
1231 * @vma: The vma for which we want to lock the vm resv and any attached
1232 * object's resv.
1233 *
1234 * Return: 0 on success, negative error code on error. In particular
1235 * may return -EDEADLK on WW transaction contention and -EINTR if
1236 * an interruptible wait is terminated by a signal.
1237 */
1238 int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma)
1239 {
1240 struct xe_vm *vm = xe_vma_vm(vma);
1241 struct xe_bo *bo = xe_vma_bo(vma);
1242 int err;
1243
1244 XE_WARN_ON(!vm);
1245
1246 err = drm_exec_lock_obj(exec, xe_vm_obj(vm));
1247 if (!err && bo && !bo->vm)
1248 err = drm_exec_lock_obj(exec, &bo->ttm.base);
1249
1250 return err;
1251 }
1252
1253 static void xe_vma_destroy_unlocked(struct xe_vma *vma)
1254 {
1255 struct xe_device *xe = xe_vma_vm(vma)->xe;
1256 struct xe_validation_ctx ctx;
1257 struct drm_exec exec;
1258 int err = 0;
1259
1260 xe_validation_guard(&ctx, &xe->val, &exec, (struct xe_val_flags) {}, err) {
1261 err = xe_vm_lock_vma(&exec, vma);
1262 drm_exec_retry_on_contention(&exec);
1263 if (XE_WARN_ON(err))
1264 break;
1265 xe_vma_destroy(vma, NULL);
1266 }
1267 xe_assert(xe, !err);
1268 }
1269
1270 struct xe_vma *
1271 xe_vm_find_overlapping_vma(struct xe_vm *vm, u64 start, u64 range)
1272 {
1273 struct drm_gpuva *gpuva;
1274
1275 lockdep_assert_held(&vm->lock);
1276
1277 if (xe_vm_is_closed_or_banned(vm))
1278 return NULL;
1279
1280 xe_assert(vm->xe, start + range <= vm->size);
1281
1282 gpuva = drm_gpuva_find_first(&vm->gpuvm, start, range);
1283
1284 return gpuva ? gpuva_to_vma(gpuva) : NULL;
1285 }
1286
1287 static int xe_vm_insert_vma(struct xe_vm *vm, struct xe_vma *vma)
1288 {
1289 int err;
1290
1291 xe_assert(vm->xe, xe_vma_vm(vma) == vm);
1292 lockdep_assert_held(&vm->lock);
1293
1294 mutex_lock(&vm->snap_mutex);
1295 err = drm_gpuva_insert(&vm->gpuvm, &vma->gpuva);
1296 mutex_unlock(&vm->snap_mutex);
1297 XE_WARN_ON(err); /* Shouldn't be possible */
1298
1299 return err;
1300 }
1301
1302 static void xe_vm_remove_vma(struct xe_vm *vm, struct xe_vma *vma)
1303 {
1304 xe_assert(vm->xe, xe_vma_vm(vma) == vm);
1305 lockdep_assert_held(&vm->lock);
1306
1307 mutex_lock(&vm->snap_mutex);
1308 drm_gpuva_remove(&vma->gpuva);
1309 mutex_unlock(&vm->snap_mutex);
1310 if (vm->usm.last_fault_vma == vma)
1311 vm->usm.last_fault_vma = NULL;
1312 }
1313
1314 static struct drm_gpuva_op *xe_vm_op_alloc(void)
1315 {
1316 struct xe_vma_op *op;
1317
1318 op = kzalloc_obj(*op);
1319
1320 if (unlikely(!op))
1321 return NULL;
1322
1323 return &op->base;
1324 }
1325
1326 static void xe_vm_free(struct drm_gpuvm *gpuvm);
1327
1328 static const struct drm_gpuvm_ops gpuvm_ops = {
1329 .op_alloc = xe_vm_op_alloc,
1330 .vm_bo_validate = xe_gpuvm_validate,
1331 .vm_free = xe_vm_free,
1332 };
1333
1334 static u64 pde_encode_pat_index(u16 pat_index)
1335 {
1336 u64 pte = 0;
1337
1338 if (pat_index & BIT(0))
1339 pte |= XE_PPGTT_PTE_PAT0;
1340
1341 if (pat_index & BIT(1))
1342 pte |= XE_PPGTT_PTE_PAT1;
1343
1344 return pte;
1345 }
1346
1347 static u64 pte_encode_pat_index(u16 pat_index, u32 pt_level)
1348 {
1349 u64 pte = 0;
1350
1351 if (pat_index & BIT(0))
1352 pte |= XE_PPGTT_PTE_PAT0;
1353
1354 if (pat_index & BIT(1))
1355 pte |= XE_PPGTT_PTE_PAT1;
1356
1357 if (pat_index & BIT(2)) {
1358 if (pt_level)
1359 pte |= XE_PPGTT_PDE_PDPE_PAT2;
1360 else
1361 pte |= XE_PPGTT_PTE_PAT2;
1362 }
1363
1364 if (pat_index & BIT(3))
1365 pte |= XELPG_PPGTT_PTE_PAT3;
1366
1367 if (pat_index & (BIT(4)))
1368 pte |= XE2_PPGTT_PTE_PAT4;
1369
1370 return pte;
1371 }
1372
1373 static u64 pte_encode_ps(u32 pt_level)
1374 {
1375 XE_WARN_ON(pt_level > MAX_HUGEPTE_LEVEL);
1376
1377 if (pt_level == 1)
1378 return XE_PDE_PS_2M;
1379 else if (pt_level == 2)
1380 return XE_PDPE_PS_1G;
1381
1382 return 0;
1383 }
1384
1385 static u16 pde_pat_index(struct xe_bo *bo)
1386 {
1387 struct xe_device *xe = xe_bo_device(bo);
1388 u16 pat_index;
1389
1390 /*
1391 * We only have two bits to encode the PAT index in non-leaf nodes, but
1392 * these only point to other paging structures so we only need a minimal
1393 * selection of options. The user PAT index is only for encoding leaf
1394 * nodes, where more bits are available to do the encoding. The
1395 * non-leaf nodes are instead under driver control so the chosen index
1396 * here should be distinct from the user PAT index. Also the
1397 * corresponding coherency of the PAT index should be tied to the
1398 * allocation type of the page table (or at least we should pick
1399 * something which is always safe).
1400 */
1401 if (!xe_bo_is_vram(bo) && bo->ttm.ttm->caching == ttm_cached)
1402 pat_index = xe->pat.idx[XE_CACHE_WB];
1403 else
1404 pat_index = xe->pat.idx[XE_CACHE_NONE];
1405
1406 xe_assert(xe, pat_index <= 3);
1407
1408 return pat_index;
1409 }
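
/*
 * Worked example (illustrative): for a leaf PTE, pat_index == 0b00110 makes
 * pte_encode_pat_index() set XE_PPGTT_PTE_PAT1 | XE_PPGTT_PTE_PAT2 (or
 * XE_PPGTT_PDE_PDPE_PAT2 for non-leaf levels). Paging-structure entries can
 * only encode the two low bits via pde_encode_pat_index(), which is why
 * pde_pat_index() above asserts the chosen index is <= 3.
 */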
1410
1411 static u64 xelp_pde_encode_bo(struct xe_bo *bo, u64 bo_offset)
1412 {
1413 u64 pde;
1414
1415 pde = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
1416 pde |= XE_PAGE_PRESENT | XE_PAGE_RW;
1417 pde |= pde_encode_pat_index(pde_pat_index(bo));
1418
1419 return pde;
1420 }
1421
1422 static u64 xelp_pte_encode_bo(struct xe_bo *bo, u64 bo_offset,
1423 u16 pat_index, u32 pt_level)
1424 {
1425 u64 pte;
1426
1427 pte = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
1428 pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
1429 pte |= pte_encode_pat_index(pat_index, pt_level);
1430 pte |= pte_encode_ps(pt_level);
1431
1432 if (xe_bo_is_vram(bo) || xe_bo_is_stolen_devmem(bo))
1433 pte |= XE_PPGTT_PTE_DM;
1434
1435 return pte;
1436 }
1437
1438 static u64 xelp_pte_encode_vma(u64 pte, struct xe_vma *vma,
1439 u16 pat_index, u32 pt_level)
1440 {
1441 struct xe_bo *bo = xe_vma_bo(vma);
1442 struct xe_vm *vm = xe_vma_vm(vma);
1443
1444 pte |= XE_PAGE_PRESENT;
1445
1446 if (likely(!xe_vma_read_only(vma)))
1447 pte |= XE_PAGE_RW;
1448
1449 pte |= pte_encode_pat_index(pat_index, pt_level);
1450 pte |= pte_encode_ps(pt_level);
1451
1452 /*
1453 * NULL PTEs redirect to scratch page (return zeros on read).
1454 * Set for: 1) explicit null VMAs, 2) purged BOs on scratch VMs.
1455 * Never set NULL flag without scratch page - causes undefined behavior.
1456 */
1457 if (unlikely(xe_vma_is_null(vma) ||
1458 (bo && xe_bo_is_purged(bo) && xe_vm_has_scratch(vm))))
1459 pte |= XE_PTE_NULL;
1460
1461 return pte;
1462 }
1463
1464 static u64 xelp_pte_encode_addr(struct xe_device *xe, u64 addr,
1465 u16 pat_index,
1466 u32 pt_level, bool devmem, u64 flags)
1467 {
1468 u64 pte;
1469
1470 /* Avoid passing random bits directly as flags */
1471 xe_assert(xe, !(flags & ~XE_PTE_PS64));
1472
1473 pte = addr;
1474 pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
1475 pte |= pte_encode_pat_index(pat_index, pt_level);
1476 pte |= pte_encode_ps(pt_level);
1477
1478 if (devmem)
1479 pte |= XE_PPGTT_PTE_DM;
1480
1481 pte |= flags;
1482
1483 return pte;
1484 }
1485
1486 static const struct xe_pt_ops xelp_pt_ops = {
1487 .pte_encode_bo = xelp_pte_encode_bo,
1488 .pte_encode_vma = xelp_pte_encode_vma,
1489 .pte_encode_addr = xelp_pte_encode_addr,
1490 .pde_encode_bo = xelp_pde_encode_bo,
1491 };
1492
1493 static void vm_destroy_work_func(struct work_struct *w);
1494
1495 /**
1496 * xe_vm_create_scratch() - Setup a scratch memory pagetable tree for the
1497 * given tile and vm.
1498 * @xe: xe device.
1499 * @tile: tile to set up for.
1500 * @vm: vm to set up for.
1501 * @exec: The struct drm_exec object used to lock the vm resv.
1502 *
1503 * Sets up a pagetable tree with one page-table per level and a single
1504 * leaf PTE. All pagetable entries point to the single page-table or,
1505 * for MAX_HUGEPTE_LEVEL, a NULL huge PTE returning 0 on read and
1506 * writes become NOPs.
1507 *
1508 * Return: 0 on success, negative error code on error.
1509 */
1510 static int xe_vm_create_scratch(struct xe_device *xe, struct xe_tile *tile,
1511 struct xe_vm *vm, struct drm_exec *exec)
1512 {
1513 u8 id = tile->id;
1514 int i;
1515
1516 for (i = MAX_HUGEPTE_LEVEL; i < vm->pt_root[id]->level; i++) {
1517 vm->scratch_pt[id][i] = xe_pt_create(vm, tile, i, exec);
1518 if (IS_ERR(vm->scratch_pt[id][i])) {
1519 int err = PTR_ERR(vm->scratch_pt[id][i]);
1520
1521 vm->scratch_pt[id][i] = NULL;
1522 return err;
1523 }
1524 xe_pt_populate_empty(tile, vm, vm->scratch_pt[id][i]);
1525 }
1526
1527 return 0;
1528 }
1529 ALLOW_ERROR_INJECTION(xe_vm_create_scratch, ERRNO);
1530
1531 static void xe_vm_free_scratch(struct xe_vm *vm)
1532 {
1533 struct xe_tile *tile;
1534 u8 id;
1535
1536 if (!xe_vm_has_scratch(vm))
1537 return;
1538
1539 for_each_tile(tile, vm->xe, id) {
1540 u32 i;
1541
1542 if (!vm->pt_root[id])
1543 continue;
1544
1545 for (i = MAX_HUGEPTE_LEVEL; i < vm->pt_root[id]->level; ++i)
1546 if (vm->scratch_pt[id][i])
1547 xe_pt_destroy(vm->scratch_pt[id][i], vm->flags, NULL);
1548 }
1549 }
1550
1551 static void xe_vm_pt_destroy(struct xe_vm *vm)
1552 {
1553 struct xe_tile *tile;
1554 u8 id;
1555
1556 xe_vm_assert_held(vm);
1557
1558 for_each_tile(tile, vm->xe, id) {
1559 if (vm->pt_root[id]) {
1560 xe_pt_destroy(vm->pt_root[id], vm->flags, NULL);
1561 vm->pt_root[id] = NULL;
1562 }
1563 }
1564 }
1565
1566 static void xe_vm_init_prove_locking(struct xe_device *xe, struct xe_vm *vm)
1567 {
1568 if (!IS_ENABLED(CONFIG_PROVE_LOCKING))
1569 return;
1570
1571 fs_reclaim_acquire(GFP_KERNEL);
1572 might_lock(&vm->exec_queues.lock);
1573 fs_reclaim_release(GFP_KERNEL);
1574
1575 down_read(&vm->exec_queues.lock);
1576 might_lock(&xe_root_mmio_gt(xe)->uc.guc.ct.lock);
1577 up_read(&vm->exec_queues.lock);
1578 }
1579
1580 struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags, struct xe_file *xef)
1581 {
1582 struct drm_gem_object *vm_resv_obj;
1583 struct xe_validation_ctx ctx;
1584 struct drm_exec exec;
1585 struct xe_vm *vm;
1586 int err;
1587 struct xe_tile *tile;
1588 u8 id;
1589
1590 /*
1591 * Since the GSCCS is not user-accessible, we don't expect a GSC VM to
1592 * ever be in faulting mode.
1593 */
1594 xe_assert(xe, !((flags & XE_VM_FLAG_GSC) && (flags & XE_VM_FLAG_FAULT_MODE)));
1595
1596 vm = kzalloc(sizeof(*vm), GFP_KERNEL);
1597 if (!vm)
1598 return ERR_PTR(-ENOMEM);
1599
1600 vm->xe = xe;
1601
1602 vm->size = 1ull << xe->info.va_bits;
1603 vm->flags = flags;
1604
1605 if (xef)
1606 vm->xef = xe_file_get(xef);
1607 /*
1608 * GSC VMs are kernel-owned, only used for PXP ops and can sometimes be
1609 * manipulated under the PXP mutex. However, the PXP mutex can be taken
1610 * under a user-VM lock when the PXP session is started at exec_queue
1611 * creation time. Those are different VMs and therefore there is no risk
1612 * of deadlock, but we need to tell lockdep that this is the case or it
1613 * will print a warning.
1614 */
1615 if (flags & XE_VM_FLAG_GSC) {
1616 static struct lock_class_key gsc_vm_key;
1617
1618 __init_rwsem(&vm->lock, "gsc_vm", &gsc_vm_key);
1619 } else {
1620 init_rwsem(&vm->lock);
1621 }
1622 mutex_init(&vm->snap_mutex);
1623
1624 INIT_LIST_HEAD(&vm->rebind_list);
1625
1626 INIT_LIST_HEAD(&vm->userptr.repin_list);
1627 INIT_LIST_HEAD(&vm->userptr.invalidated);
1628 spin_lock_init(&vm->userptr.invalidated_lock);
1629
1630 INIT_LIST_HEAD(&vm->faults.list);
1631 spin_lock_init(&vm->faults.lock);
1632
1633 ttm_lru_bulk_move_init(&vm->lru_bulk_move);
1634
1635 INIT_WORK(&vm->destroy_work, vm_destroy_work_func);
1636
1637 INIT_LIST_HEAD(&vm->preempt.exec_queues);
1638 for (id = 0; id < XE_MAX_TILES_PER_DEVICE * XE_MAX_GT_PER_TILE; ++id)
1639 INIT_LIST_HEAD(&vm->exec_queues.list[id]);
1640 if (flags & XE_VM_FLAG_FAULT_MODE)
1641 vm->preempt.min_run_period_ms = xe->min_run_period_pf_ms;
1642 else
1643 vm->preempt.min_run_period_ms = xe->min_run_period_lr_ms;
1644
1645 init_rwsem(&vm->exec_queues.lock);
1646 xe_vm_init_prove_locking(xe, vm);
1647
1648 for_each_tile(tile, xe, id)
1649 xe_range_fence_tree_init(&vm->rftree[id]);
1650
1651 vm->pt_ops = &xelp_pt_ops;
1652
1653 /*
1654 * Long-running workloads are not protected by the scheduler references.
1655 * By design, run_job for long-running workloads returns NULL and the
1656 * scheduler drops all the references of it, hence protecting the VM
1657 * for this case is necessary.
1658 */
1659 if (flags & XE_VM_FLAG_LR_MODE) {
1660 INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func);
1661 xe_pm_runtime_get_noresume(xe);
1662 INIT_LIST_HEAD(&vm->preempt.pm_activate_link);
1663 }
1664
1665 err = xe_svm_init(vm);
1666 if (err)
1667 goto err_no_resv;
1668
1669 vm_resv_obj = drm_gpuvm_resv_object_alloc(&xe->drm);
1670 if (!vm_resv_obj) {
1671 err = -ENOMEM;
1672 goto err_svm_fini;
1673 }
1674
1675 drm_gpuvm_init(&vm->gpuvm, "Xe VM", DRM_GPUVM_RESV_PROTECTED, &xe->drm,
1676 vm_resv_obj, 0, vm->size, 0, 0, &gpuvm_ops);
1677
1678 drm_gem_object_put(vm_resv_obj);
1679
1680 err = 0;
1681 xe_validation_guard(&ctx, &xe->val, &exec, (struct xe_val_flags) {.interruptible = true},
1682 err) {
1683 err = xe_vm_drm_exec_lock(vm, &exec);
1684 drm_exec_retry_on_contention(&exec);
1685
1686 if (IS_DGFX(xe) && xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)
1687 vm->flags |= XE_VM_FLAG_64K;
1688
1689 for_each_tile(tile, xe, id) {
1690 if (flags & XE_VM_FLAG_MIGRATION &&
1691 tile->id != XE_VM_FLAG_TILE_ID(flags))
1692 continue;
1693
1694 vm->pt_root[id] = xe_pt_create(vm, tile, xe->info.vm_max_level,
1695 &exec);
1696 if (IS_ERR(vm->pt_root[id])) {
1697 err = PTR_ERR(vm->pt_root[id]);
1698 vm->pt_root[id] = NULL;
1699 xe_vm_pt_destroy(vm);
1700 drm_exec_retry_on_contention(&exec);
1701 xe_validation_retry_on_oom(&ctx, &err);
1702 break;
1703 }
1704 }
1705 if (err)
1706 break;
1707
1708 if (xe_vm_has_scratch(vm)) {
1709 for_each_tile(tile, xe, id) {
1710 if (!vm->pt_root[id])
1711 continue;
1712
1713 err = xe_vm_create_scratch(xe, tile, vm, &exec);
1714 if (err) {
1715 xe_vm_free_scratch(vm);
1716 xe_vm_pt_destroy(vm);
1717 drm_exec_retry_on_contention(&exec);
1718 xe_validation_retry_on_oom(&ctx, &err);
1719 break;
1720 }
1721 }
1722 if (err)
1723 break;
1724 vm->batch_invalidate_tlb = true;
1725 }
1726
1727 if (vm->flags & XE_VM_FLAG_LR_MODE) {
1728 INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func);
1729 vm->batch_invalidate_tlb = false;
1730 }
1731
1732 /* Fill pt_root after allocating scratch tables */
1733 for_each_tile(tile, xe, id) {
1734 if (!vm->pt_root[id])
1735 continue;
1736
1737 xe_pt_populate_empty(tile, vm, vm->pt_root[id]);
1738 }
1739 }
1740 if (err)
1741 goto err_close;
1742
1743 /* Kernel migration VM shouldn't have a circular loop.. */
1744 if (!(flags & XE_VM_FLAG_MIGRATION)) {
1745 for_each_tile(tile, xe, id) {
1746 struct xe_exec_queue *q;
1747 u32 create_flags = EXEC_QUEUE_FLAG_VM;
1748
1749 if (!vm->pt_root[id])
1750 continue;
1751
1752 if (!xef) /* Not from userspace */
1753 create_flags |= EXEC_QUEUE_FLAG_KERNEL;
1754
1755 q = xe_exec_queue_create_bind(xe, tile, vm, create_flags, 0);
1756 if (IS_ERR(q)) {
1757 err = PTR_ERR(q);
1758 goto err_close;
1759 }
1760 vm->q[id] = q;
1761 }
1762 }
1763
1764 if (xef && xe->info.has_asid) {
1765 u32 asid;
1766
1767 down_write(&xe->usm.lock);
1768 err = xa_alloc_cyclic(&xe->usm.asid_to_vm, &asid, vm,
1769 XA_LIMIT(1, XE_MAX_ASID - 1),
1770 &xe->usm.next_asid, GFP_NOWAIT);
1771 up_write(&xe->usm.lock);
1772 if (err < 0)
1773 goto err_close;
1774
1775 vm->usm.asid = asid;
1776 }
1777
1778 trace_xe_vm_create(vm);
1779
1780 return vm;
1781
1782 err_close:
1783 xe_vm_close_and_put(vm);
1784 return ERR_PTR(err);
1785
1786 err_svm_fini:
1787 if (flags & XE_VM_FLAG_FAULT_MODE) {
1788 vm->size = 0; /* close the vm */
1789 xe_svm_fini(vm);
1790 }
1791 err_no_resv:
1792 mutex_destroy(&vm->snap_mutex);
1793 for_each_tile(tile, xe, id)
1794 xe_range_fence_tree_fini(&vm->rftree[id]);
1795 ttm_lru_bulk_move_fini(&xe->ttm, &vm->lru_bulk_move);
1796 if (vm->xef)
1797 xe_file_put(vm->xef);
1798 kfree(vm);
1799 if (flags & XE_VM_FLAG_LR_MODE)
1800 xe_pm_runtime_put(xe);
1801 return ERR_PTR(err);
1802 }
1803
1804 static void xe_vm_close(struct xe_vm *vm)
1805 {
1806 struct xe_device *xe = vm->xe;
1807 bool bound;
1808 int idx;
1809
1810 bound = drm_dev_enter(&xe->drm, &idx);
1811
1812 down_write(&vm->lock);
1813 if (xe_vm_in_fault_mode(vm))
1814 xe_svm_notifier_lock(vm);
1815
1816 vm->size = 0;
1817
1818 if (!((vm->flags & XE_VM_FLAG_MIGRATION))) {
1819 struct xe_tile *tile;
1820 struct xe_gt *gt;
1821 u8 id;
1822
1823 /* Wait for pending binds */
1824 dma_resv_wait_timeout(xe_vm_resv(vm),
1825 DMA_RESV_USAGE_BOOKKEEP,
1826 false, MAX_SCHEDULE_TIMEOUT);
1827
1828 if (bound) {
1829 for_each_tile(tile, xe, id)
1830 if (vm->pt_root[id])
1831 xe_pt_clear(xe, vm->pt_root[id]);
1832
1833 for_each_gt(gt, xe, id)
1834 xe_tlb_inval_vm(>->tlb_inval, vm);
1835 }
1836 }
1837
1838 if (xe_vm_in_fault_mode(vm))
1839 xe_svm_notifier_unlock(vm);
1840 up_write(&vm->lock);
1841
1842 if (bound)
1843 drm_dev_exit(idx);
1844 }
1845
1846 void xe_vm_close_and_put(struct xe_vm *vm)
1847 {
1848 LIST_HEAD(contested);
1849 struct xe_device *xe = vm->xe;
1850 struct xe_tile *tile;
1851 struct xe_vma *vma, *next_vma;
1852 struct drm_gpuva *gpuva, *next;
1853 u8 id;
1854
1855 xe_assert(xe, !vm->preempt.num_exec_queues);
1856
1857 xe_vm_close(vm);
1858 if (xe_vm_in_preempt_fence_mode(vm)) {
1859 mutex_lock(&xe->rebind_resume_lock);
1860 list_del_init(&vm->preempt.pm_activate_link);
1861 mutex_unlock(&xe->rebind_resume_lock);
1862 flush_work(&vm->preempt.rebind_work);
1863 }
1864 if (xe_vm_in_fault_mode(vm))
1865 xe_svm_close(vm);
1866
1867 down_write(&vm->lock);
1868 for_each_tile(tile, xe, id) {
1869 if (vm->q[id]) {
1870 int i;
1871
1872 xe_exec_queue_last_fence_put(vm->q[id], vm);
1873 for_each_tlb_inval(i)
1874 xe_exec_queue_tlb_inval_last_fence_put(vm->q[id], vm, i);
1875 }
1876 }
1877 up_write(&vm->lock);
1878
1879 for_each_tile(tile, xe, id) {
1880 if (vm->q[id]) {
1881 xe_exec_queue_kill(vm->q[id]);
1882 xe_exec_queue_put(vm->q[id]);
1883 vm->q[id] = NULL;
1884 }
1885 }
1886
1887 down_write(&vm->lock);
1888 xe_vm_lock(vm, false);
1889 drm_gpuvm_for_each_va_safe(gpuva, next, &vm->gpuvm) {
1890 vma = gpuva_to_vma(gpuva);
1891
1892 if (xe_vma_has_no_bo(vma)) {
1893 xe_svm_notifier_lock(vm);
1894 vma->gpuva.flags |= XE_VMA_DESTROYED;
1895 xe_svm_notifier_unlock(vm);
1896 }
1897
1898 xe_vm_remove_vma(vm, vma);
1899
1900 /* easy case, remove from VMA? */
1901 if (xe_vma_has_no_bo(vma) || xe_vma_bo(vma)->vm) {
1902 list_del_init(&vma->combined_links.rebind);
1903 xe_vma_destroy(vma, NULL);
1904 continue;
1905 }
1906
1907 list_move_tail(&vma->combined_links.destroy, &contested);
1908 vma->gpuva.flags |= XE_VMA_DESTROYED;
1909 }
1910
1911 /*
1912 * All vm operations will add shared fences to resv.
1913 * The only exception is eviction for a shared object,
1914 * but even so, the unbind when evicted would still
1915 * install a fence to resv. Hence it's safe to
1916 * destroy the pagetables immediately.
1917 */
1918 xe_vm_free_scratch(vm);
1919 xe_vm_pt_destroy(vm);
1920 xe_vm_unlock(vm);
1921
1922 /*
1923 * VM is now dead, cannot re-add nodes to vm->vmas if it's NULL
1924 * Since we hold a refcount to the bo, we can remove and free
1925 * the members safely without locking.
1926 */
1927 list_for_each_entry_safe(vma, next_vma, &contested,
1928 combined_links.destroy) {
1929 list_del_init(&vma->combined_links.destroy);
1930 xe_vma_destroy_unlocked(vma);
1931 }
1932
1933 xe_svm_fini(vm);
1934
1935 up_write(&vm->lock);
1936
1937 down_write(&xe->usm.lock);
1938 if (vm->usm.asid) {
1939 void *lookup;
1940
1941 xe_assert(xe, xe->info.has_asid);
1942 xe_assert(xe, !(vm->flags & XE_VM_FLAG_MIGRATION));
1943
1944 lookup = xa_erase(&xe->usm.asid_to_vm, vm->usm.asid);
1945 xe_assert(xe, lookup == vm);
1946 }
1947 up_write(&xe->usm.lock);
1948
1949 xe_vm_clear_fault_entries(vm);
1950
1951 for_each_tile(tile, xe, id)
1952 xe_range_fence_tree_fini(&vm->rftree[id]);
1953
1954 xe_vm_put(vm);
1955 }
1956
1957 static void vm_destroy_work_func(struct work_struct *w)
1958 {
1959 struct xe_vm *vm =
1960 container_of(w, struct xe_vm, destroy_work);
1961 struct xe_device *xe = vm->xe;
1962 struct xe_tile *tile;
1963 u8 id;
1964
1965 /* xe_vm_close_and_put was not called? */
1966 xe_assert(xe, !vm->size);
1967
1968 if (xe_vm_in_preempt_fence_mode(vm))
1969 flush_work(&vm->preempt.rebind_work);
1970
1971 mutex_destroy(&vm->snap_mutex);
1972
1973 if (vm->flags & XE_VM_FLAG_LR_MODE)
1974 xe_pm_runtime_put(xe);
1975
1976 for_each_tile(tile, xe, id)
1977 XE_WARN_ON(vm->pt_root[id]);
1978
1979 trace_xe_vm_free(vm);
1980
1981 ttm_lru_bulk_move_fini(&xe->ttm, &vm->lru_bulk_move);
1982
1983 if (vm->xef)
1984 xe_file_put(vm->xef);
1985
1986 kfree(vm);
1987 }
1988
1989 static void xe_vm_free(struct drm_gpuvm *gpuvm)
1990 {
1991 struct xe_vm *vm = container_of(gpuvm, struct xe_vm, gpuvm);
1992
1993 /* To destroy the VM we need to be able to sleep */
1994 queue_work(system_dfl_wq, &vm->destroy_work);
1995 }
1996
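/**
 * xe_vm_lookup() - Look up a VM by its user-visible id
 * @xef: xe file from which the VM id was allocated
 * @id: the VM id returned by the VM create ioctl
 *
 * Takes a reference on the VM if found; the caller must release it with
 * xe_vm_put().
 *
 * Return: Pointer to the referenced VM, or NULL if no VM matches @id.
 */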
xe_vm_lookup(struct xe_file * xef,u32 id)1997 struct xe_vm *xe_vm_lookup(struct xe_file *xef, u32 id)
1998 {
1999 struct xe_vm *vm;
2000
2001 mutex_lock(&xef->vm.lock);
2002 vm = xa_load(&xef->vm.xa, id);
2003 if (vm)
2004 xe_vm_get(vm);
2005 mutex_unlock(&xef->vm.lock);
2006
2007 return vm;
2008 }
2009
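/**
 * xe_vm_pdp4_descriptor() - PDP4 descriptor for a VM on a tile
 * @vm: the VM
 * @tile: the tile whose root page table to encode
 *
 * Return: The page-directory-entry encoding of the VM's root page-table BO
 * for @tile.
 */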
xe_vm_pdp4_descriptor(struct xe_vm * vm,struct xe_tile * tile)2010 u64 xe_vm_pdp4_descriptor(struct xe_vm *vm, struct xe_tile *tile)
2011 {
2012 return vm->pt_ops->pde_encode_bo(vm->pt_root[tile->id]->bo, 0);
2013 }
2014
2015 static struct xe_exec_queue *
to_wait_exec_queue(struct xe_vm * vm,struct xe_exec_queue * q)2016 to_wait_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
2017 {
2018 return q ? q : vm->q[0];
2019 }
2020
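/*
 * Return the first user fence in @syncs with a reference taken, or NULL if
 * none of the sync entries carries a user fence.
 */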
2021 static struct xe_user_fence *
find_ufence_get(struct xe_sync_entry * syncs,u32 num_syncs)2022 find_ufence_get(struct xe_sync_entry *syncs, u32 num_syncs)
2023 {
2024 unsigned int i;
2025
2026 for (i = 0; i < num_syncs; i++) {
2027 struct xe_sync_entry *e = &syncs[i];
2028
2029 if (xe_sync_is_ufence(e))
2030 return xe_sync_ufence_get(e);
2031 }
2032
2033 return NULL;
2034 }
2035
2036 #define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE | \
2037 DRM_XE_VM_CREATE_FLAG_LR_MODE | \
2038 DRM_XE_VM_CREATE_FLAG_FAULT_MODE | \
2039 DRM_XE_VM_CREATE_FLAG_NO_VM_OVERCOMMIT)
2040
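/*
 * VM create ioctl: validate the DRM_XE_VM_CREATE flags, create the VM and
 * only then publish its id to userspace.
 *
 * Illustrative userspace sketch (not authoritative, error handling omitted):
 *
 *	struct drm_xe_vm_create create = {
 *		.flags = DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE,
 *	};
 *
 *	if (!ioctl(fd, DRM_IOCTL_XE_VM_CREATE, &create))
 *		vm_id = create.vm_id;
 */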
xe_vm_create_ioctl(struct drm_device * dev,void * data,struct drm_file * file)2041 int xe_vm_create_ioctl(struct drm_device *dev, void *data,
2042 struct drm_file *file)
2043 {
2044 struct xe_device *xe = to_xe_device(dev);
2045 struct xe_file *xef = to_xe_file(file);
2046 struct drm_xe_vm_create *args = data;
2047 struct xe_gt *wa_gt = xe_root_mmio_gt(xe);
2048 struct xe_vm *vm;
2049 u32 id;
2050 int err;
2051 u32 flags = 0;
2052
2053 if (XE_IOCTL_DBG(xe, args->extensions))
2054 return -EINVAL;
2055
2056 if (wa_gt && XE_GT_WA(wa_gt, 22014953428))
2057 args->flags |= DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE;
2058
2059 if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE &&
2060 !xe->info.has_usm))
2061 return -EINVAL;
2062
2063 if (XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
2064 return -EINVAL;
2065
2066 if (XE_IOCTL_DBG(xe, args->flags & ~ALL_DRM_XE_VM_CREATE_FLAGS))
2067 return -EINVAL;
2068
2069 if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE &&
2070 args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE &&
2071 !xe->info.needs_scratch))
2072 return -EINVAL;
2073
2074 if (XE_IOCTL_DBG(xe, !(args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE) &&
2075 args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE))
2076 return -EINVAL;
2077
2078 if (XE_IOCTL_DBG(xe, !(args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE) &&
2079 args->flags & DRM_XE_VM_CREATE_FLAG_NO_VM_OVERCOMMIT))
2080 return -EINVAL;
2081
2082 if (args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE)
2083 flags |= XE_VM_FLAG_SCRATCH_PAGE;
2084 if (args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE)
2085 flags |= XE_VM_FLAG_LR_MODE;
2086 if (args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
2087 flags |= XE_VM_FLAG_FAULT_MODE;
2088 if (args->flags & DRM_XE_VM_CREATE_FLAG_NO_VM_OVERCOMMIT)
2089 flags |= XE_VM_FLAG_NO_VM_OVERCOMMIT;
2090
2091 vm = xe_vm_create(xe, flags, xef);
2092 if (IS_ERR(vm))
2093 return PTR_ERR(vm);
2094
2095 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_MEM)
2096 /* Warning: Security issue - never enable by default */
2097 args->reserved[0] = xe_bo_main_addr(vm->pt_root[0]->bo, XE_PAGE_SIZE);
2098 #endif
2099
2100 	/* User id allocation must be the last ioctl step to prevent a use-after-free */
2101 err = xa_alloc(&xef->vm.xa, &id, vm, xa_limit_32b, GFP_KERNEL);
2102 if (err)
2103 goto err_close_and_put;
2104
2105 args->vm_id = id;
2106
2107 return 0;
2108
2109 err_close_and_put:
2110 xe_vm_close_and_put(vm);
2111
2112 return err;
2113 }
2114
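/*
 * VM destroy ioctl: the id is erased under xef->vm.lock so new lookups can't
 * race with teardown, then the VM is closed and its reference dropped outside
 * the lock. A VM that still has preempt-fence exec queues attached is refused
 * with -EBUSY.
 */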
xe_vm_destroy_ioctl(struct drm_device * dev,void * data,struct drm_file * file)2115 int xe_vm_destroy_ioctl(struct drm_device *dev, void *data,
2116 struct drm_file *file)
2117 {
2118 struct xe_device *xe = to_xe_device(dev);
2119 struct xe_file *xef = to_xe_file(file);
2120 struct drm_xe_vm_destroy *args = data;
2121 struct xe_vm *vm;
2122 int err = 0;
2123
2124 if (XE_IOCTL_DBG(xe, args->pad) ||
2125 XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
2126 return -EINVAL;
2127
2128 mutex_lock(&xef->vm.lock);
2129 vm = xa_load(&xef->vm.xa, args->vm_id);
2130 if (XE_IOCTL_DBG(xe, !vm))
2131 err = -ENOENT;
2132 else if (XE_IOCTL_DBG(xe, vm->preempt.num_exec_queues))
2133 err = -EBUSY;
2134 else
2135 xa_erase(&xef->vm.xa, args->vm_id);
2136 mutex_unlock(&xef->vm.lock);
2137
2138 if (!err)
2139 xe_vm_close_and_put(vm);
2140
2141 return err;
2142 }
2143
xe_vm_query_vmas(struct xe_vm * vm,u64 start,u64 end)2144 static int xe_vm_query_vmas(struct xe_vm *vm, u64 start, u64 end)
2145 {
2146 struct drm_gpuva *gpuva;
2147 u32 num_vmas = 0;
2148
2149 lockdep_assert_held(&vm->lock);
2150 drm_gpuvm_for_each_va_range(gpuva, &vm->gpuvm, start, end)
2151 num_vmas++;
2152
2153 return num_vmas;
2154 }
2155
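/*
 * Fill @attrs with the memory-range attributes of all VMAs overlapping
 * [start, end). On success *num_vmas is updated to the number of entries
 * written; -ENOSPC is returned if the caller-provided array is too small.
 */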
get_mem_attrs(struct xe_vm * vm,u32 * num_vmas,u64 start,u64 end,struct drm_xe_mem_range_attr * attrs)2156 static int get_mem_attrs(struct xe_vm *vm, u32 *num_vmas, u64 start,
2157 u64 end, struct drm_xe_mem_range_attr *attrs)
2158 {
2159 struct drm_gpuva *gpuva;
2160 int i = 0;
2161
2162 lockdep_assert_held(&vm->lock);
2163
2164 drm_gpuvm_for_each_va_range(gpuva, &vm->gpuvm, start, end) {
2165 struct xe_vma *vma = gpuva_to_vma(gpuva);
2166
2167 if (i == *num_vmas)
2168 return -ENOSPC;
2169
2170 attrs[i].start = xe_vma_start(vma);
2171 attrs[i].end = xe_vma_end(vma);
2172 attrs[i].atomic.val = vma->attr.atomic_access;
2173 attrs[i].pat_index.val = vma->attr.pat_index;
2174 attrs[i].preferred_mem_loc.devmem_fd = vma->attr.preferred_loc.devmem_fd;
2175 attrs[i].preferred_mem_loc.migration_policy =
2176 vma->attr.preferred_loc.migration_policy;
2177
2178 i++;
2179 }
2180
2181 *num_vmas = i;
2182 return 0;
2183 }
2184
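/*
 * Query ioctl for per-range memory attributes. Userspace typically calls it
 * twice: first with num_mem_ranges == 0 to learn the number of ranges and the
 * per-entry size, then again with a buffer large enough to hold them.
 *
 * Illustrative sketch (error handling omitted; the exact ioctl request macro
 * name below is assumed, check uapi/drm/xe_drm.h):
 *
 *	struct drm_xe_vm_query_mem_range_attr query = {
 *		.vm_id = vm_id, .start = addr, .range = size,
 *	};
 *
 *	ioctl(fd, DRM_IOCTL_XE_VM_QUERY_MEM_RANGE_ATTRS, &query);
 *	attrs = calloc(query.num_mem_ranges, query.sizeof_mem_range_attr);
 *	query.vector_of_mem_attr = (uintptr_t)attrs;
 *	ioctl(fd, DRM_IOCTL_XE_VM_QUERY_MEM_RANGE_ATTRS, &query);
 */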
xe_vm_query_vmas_attrs_ioctl(struct drm_device * dev,void * data,struct drm_file * file)2185 int xe_vm_query_vmas_attrs_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
2186 {
2187 struct xe_device *xe = to_xe_device(dev);
2188 struct xe_file *xef = to_xe_file(file);
2189 struct drm_xe_mem_range_attr *mem_attrs;
2190 struct drm_xe_vm_query_mem_range_attr *args = data;
2191 u64 __user *attrs_user = u64_to_user_ptr(args->vector_of_mem_attr);
2192 struct xe_vm *vm;
2193 int err = 0;
2194
2195 if (XE_IOCTL_DBG(xe,
2196 ((args->num_mem_ranges == 0 &&
2197 (attrs_user || args->sizeof_mem_range_attr != 0)) ||
2198 (args->num_mem_ranges > 0 &&
2199 (!attrs_user ||
2200 args->sizeof_mem_range_attr !=
2201 sizeof(struct drm_xe_mem_range_attr))))))
2202 return -EINVAL;
2203
2204 vm = xe_vm_lookup(xef, args->vm_id);
2205 if (XE_IOCTL_DBG(xe, !vm))
2206 return -EINVAL;
2207
2208 err = down_read_interruptible(&vm->lock);
2209 if (err)
2210 goto put_vm;
2211
2212 attrs_user = u64_to_user_ptr(args->vector_of_mem_attr);
2213
2214 if (args->num_mem_ranges == 0 && !attrs_user) {
2215 args->num_mem_ranges = xe_vm_query_vmas(vm, args->start, args->start + args->range);
2216 args->sizeof_mem_range_attr = sizeof(struct drm_xe_mem_range_attr);
2217 goto unlock_vm;
2218 }
2219
2220 mem_attrs = kvmalloc_array(args->num_mem_ranges, args->sizeof_mem_range_attr,
2221 GFP_KERNEL | __GFP_ACCOUNT |
2222 __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
2223 if (!mem_attrs) {
2224 err = args->num_mem_ranges > 1 ? -ENOBUFS : -ENOMEM;
2225 goto unlock_vm;
2226 }
2227
2228 memset(mem_attrs, 0, args->num_mem_ranges * args->sizeof_mem_range_attr);
2229 err = get_mem_attrs(vm, &args->num_mem_ranges, args->start,
2230 args->start + args->range, mem_attrs);
2231 if (err)
2232 goto free_mem_attrs;
2233
2234 err = copy_to_user(attrs_user, mem_attrs,
2235 args->sizeof_mem_range_attr * args->num_mem_ranges);
2236 if (err)
2237 err = -EFAULT;
2238
2239 free_mem_attrs:
2240 kvfree(mem_attrs);
2241 unlock_vm:
2242 up_read(&vm->lock);
2243 put_vm:
2244 xe_vm_put(vm);
2245 return err;
2246 }
2247
vma_matches(struct xe_vma * vma,u64 page_addr)2248 static bool vma_matches(struct xe_vma *vma, u64 page_addr)
2249 {
2250 if (page_addr > xe_vma_end(vma) - 1 ||
2251 page_addr + SZ_4K - 1 < xe_vma_start(vma))
2252 return false;
2253
2254 return true;
2255 }
2256
2257 /**
2258 * xe_vm_find_vma_by_addr() - Find a VMA by its address
2259 *
2260 * @vm: the xe_vm the vma belongs to
2261 * @page_addr: address to look up
2262  *
 * Return: Pointer to the VMA containing @page_addr, or NULL if no VMA
 * overlaps that address.
 */
xe_vm_find_vma_by_addr(struct xe_vm * vm,u64 page_addr)2263 struct xe_vma *xe_vm_find_vma_by_addr(struct xe_vm *vm, u64 page_addr)
2264 {
2265 struct xe_vma *vma = NULL;
2266
2267 if (vm->usm.last_fault_vma) { /* Fast lookup */
2268 if (vma_matches(vm->usm.last_fault_vma, page_addr))
2269 vma = vm->usm.last_fault_vma;
2270 }
2271 if (!vma)
2272 vma = xe_vm_find_overlapping_vma(vm, page_addr, SZ_4K);
2273
2274 return vma;
2275 }
2276
2277 static const u32 region_to_mem_type[] = {
2278 XE_PL_TT,
2279 XE_PL_VRAM0,
2280 XE_PL_VRAM1,
2281 };
2282
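/*
 * Mark @vma destroyed under the SVM notifier lock so concurrent invalidations
 * observe it, and remove it from the VM if it was already committed.
 */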
prep_vma_destroy(struct xe_vm * vm,struct xe_vma * vma,bool post_commit)2283 static void prep_vma_destroy(struct xe_vm *vm, struct xe_vma *vma,
2284 bool post_commit)
2285 {
2286 xe_svm_notifier_lock(vm);
2287 vma->gpuva.flags |= XE_VMA_DESTROYED;
2288 xe_svm_notifier_unlock(vm);
2289 if (post_commit)
2290 xe_vm_remove_vma(vm, vma);
2291 }
2292
2293 #undef ULL
2294 #define ULL unsigned long long
2295
2296 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
print_op(struct xe_device * xe,struct drm_gpuva_op * op)2297 static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
2298 {
2299 struct xe_vma *vma;
2300
2301 switch (op->op) {
2302 case DRM_GPUVA_OP_MAP:
2303 vm_dbg(&xe->drm, "MAP: addr=0x%016llx, range=0x%016llx",
2304 (ULL)op->map.va.addr, (ULL)op->map.va.range);
2305 break;
2306 case DRM_GPUVA_OP_REMAP:
2307 vma = gpuva_to_vma(op->remap.unmap->va);
2308 vm_dbg(&xe->drm, "REMAP:UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
2309 (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
2310 op->remap.unmap->keep ? 1 : 0);
2311 if (op->remap.prev)
2312 vm_dbg(&xe->drm,
2313 "REMAP:PREV: addr=0x%016llx, range=0x%016llx",
2314 (ULL)op->remap.prev->va.addr,
2315 (ULL)op->remap.prev->va.range);
2316 if (op->remap.next)
2317 vm_dbg(&xe->drm,
2318 "REMAP:NEXT: addr=0x%016llx, range=0x%016llx",
2319 (ULL)op->remap.next->va.addr,
2320 (ULL)op->remap.next->va.range);
2321 break;
2322 case DRM_GPUVA_OP_UNMAP:
2323 vma = gpuva_to_vma(op->unmap.va);
2324 vm_dbg(&xe->drm, "UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
2325 (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
2326 op->unmap.keep ? 1 : 0);
2327 break;
2328 case DRM_GPUVA_OP_PREFETCH:
2329 vma = gpuva_to_vma(op->prefetch.va);
2330 vm_dbg(&xe->drm, "PREFETCH: addr=0x%016llx, range=0x%016llx",
2331 (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma));
2332 break;
2333 default:
2334 drm_warn(&xe->drm, "NOT POSSIBLE\n");
2335 }
2336 }
2337 #else
print_op(struct xe_device * xe,struct drm_gpuva_op * op)2338 static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
2339 {
2340 }
2341 #endif
2342
__xe_vm_needs_clear_scratch_pages(struct xe_vm * vm,u32 bind_flags)2343 static bool __xe_vm_needs_clear_scratch_pages(struct xe_vm *vm, u32 bind_flags)
2344 {
2345 if (!xe_vm_in_fault_mode(vm))
2346 return false;
2347
2348 if (!xe_vm_has_scratch(vm))
2349 return false;
2350
2351 if (bind_flags & DRM_XE_VM_BIND_FLAG_IMMEDIATE)
2352 return false;
2353
2354 return true;
2355 }
2356
xe_svm_prefetch_gpuva_ops_fini(struct drm_gpuva_ops * ops)2357 static void xe_svm_prefetch_gpuva_ops_fini(struct drm_gpuva_ops *ops)
2358 {
2359 struct drm_gpuva_op *__op;
2360
2361 drm_gpuva_for_each_op(__op, ops) {
2362 struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2363
2364 xe_vma_svm_prefetch_op_fini(op);
2365 }
2366 }
2367
2368 /*
2369  * Create the operations list from the IOCTL arguments and set up the operation
2370  * fields so that parse and commit are decoupled from the arguments. This step can fail.
2371  */
2372 static struct drm_gpuva_ops *
vm_bind_ioctl_ops_create(struct xe_vm * vm,struct xe_vma_ops * vops,struct xe_bo * bo,u64 bo_offset_or_userptr,u64 addr,u64 range,u32 operation,u32 flags,u32 prefetch_region,u16 pat_index)2373 vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_vma_ops *vops,
2374 struct xe_bo *bo, u64 bo_offset_or_userptr,
2375 u64 addr, u64 range,
2376 u32 operation, u32 flags,
2377 u32 prefetch_region, u16 pat_index)
2378 {
2379 struct drm_gem_object *obj = bo ? &bo->ttm.base : NULL;
2380 struct drm_gpuva_ops *ops;
2381 struct drm_gpuva_op *__op;
2382 struct drm_gpuvm_bo *vm_bo;
2383 u64 range_start = addr;
2384 u64 range_end = addr + range;
2385 int err;
2386
2387 lockdep_assert_held_write(&vm->lock);
2388
2389 vm_dbg(&vm->xe->drm,
2390 "op=%d, addr=0x%016llx, range=0x%016llx, bo_offset_or_userptr=0x%016llx",
2391 operation, (ULL)addr, (ULL)range,
2392 (ULL)bo_offset_or_userptr);
2393
2394 switch (operation) {
2395 case DRM_XE_VM_BIND_OP_MAP:
2396 if (flags & DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR) {
2397 xe_vm_find_cpu_addr_mirror_vma_range(vm, &range_start, &range_end);
2398 vops->flags |= XE_VMA_OPS_FLAG_ALLOW_SVM_UNMAP;
2399 }
2400
2401 fallthrough;
2402 case DRM_XE_VM_BIND_OP_MAP_USERPTR: {
2403 struct drm_gpuvm_map_req map_req = {
2404 .map.va.addr = range_start,
2405 .map.va.range = range_end - range_start,
2406 .map.gem.obj = obj,
2407 .map.gem.offset = bo_offset_or_userptr,
2408 };
2409
2410 ops = drm_gpuvm_sm_map_ops_create(&vm->gpuvm, &map_req);
2411 break;
2412 }
2413 case DRM_XE_VM_BIND_OP_UNMAP:
2414 ops = drm_gpuvm_sm_unmap_ops_create(&vm->gpuvm, addr, range);
2415 break;
2416 case DRM_XE_VM_BIND_OP_PREFETCH:
2417 ops = drm_gpuvm_prefetch_ops_create(&vm->gpuvm, addr, range);
2418 break;
2419 case DRM_XE_VM_BIND_OP_UNMAP_ALL:
2420 xe_assert(vm->xe, bo);
2421
2422 err = xe_bo_lock(bo, true);
2423 if (err)
2424 return ERR_PTR(err);
2425
2426 vm_bo = drm_gpuvm_bo_obtain_locked(&vm->gpuvm, obj);
2427 if (IS_ERR(vm_bo)) {
2428 xe_bo_unlock(bo);
2429 return ERR_CAST(vm_bo);
2430 }
2431
2432 ops = drm_gpuvm_bo_unmap_ops_create(vm_bo);
2433 drm_gpuvm_bo_put(vm_bo);
2434 xe_bo_unlock(bo);
2435 break;
2436 default:
2437 drm_warn(&vm->xe->drm, "NOT POSSIBLE\n");
2438 ops = ERR_PTR(-EINVAL);
2439 }
2440 if (IS_ERR(ops))
2441 return ops;
2442
2443 drm_gpuva_for_each_op(__op, ops) {
2444 struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2445
2446 if (__op->op == DRM_GPUVA_OP_MAP) {
2447 op->map.immediate =
2448 flags & DRM_XE_VM_BIND_FLAG_IMMEDIATE;
2449 if (flags & DRM_XE_VM_BIND_FLAG_READONLY)
2450 op->map.vma_flags |= XE_VMA_READ_ONLY;
2451 if (flags & DRM_XE_VM_BIND_FLAG_NULL)
2452 op->map.vma_flags |= DRM_GPUVA_SPARSE;
2453 if (flags & DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR)
2454 op->map.vma_flags |= XE_VMA_SYSTEM_ALLOCATOR;
2455 if (flags & DRM_XE_VM_BIND_FLAG_DUMPABLE)
2456 op->map.vma_flags |= XE_VMA_DUMPABLE;
2457 if (flags & DRM_XE_VM_BIND_FLAG_MADVISE_AUTORESET)
2458 op->map.vma_flags |= XE_VMA_MADV_AUTORESET;
2459 op->map.request_decompress = flags & DRM_XE_VM_BIND_FLAG_DECOMPRESS;
2460 op->map.pat_index = pat_index;
2461 op->map.invalidate_on_bind =
2462 __xe_vm_needs_clear_scratch_pages(vm, flags);
2463 } else if (__op->op == DRM_GPUVA_OP_PREFETCH) {
2464 struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
2465 struct xe_tile *tile;
2466 struct xe_svm_range *svm_range;
2467 struct drm_gpusvm_ctx ctx = {};
2468 struct drm_pagemap *dpagemap = NULL;
2469 u8 id, tile_mask = 0;
2470 u32 i;
2471
2472 if (!xe_vma_is_cpu_addr_mirror(vma)) {
2473 op->prefetch.region = prefetch_region;
2474 break;
2475 }
2476
2477 ctx.read_only = xe_vma_read_only(vma);
2478 ctx.devmem_possible = IS_DGFX(vm->xe) &&
2479 IS_ENABLED(CONFIG_DRM_XE_PAGEMAP);
2480
2481 for_each_tile(tile, vm->xe, id)
2482 tile_mask |= 0x1 << id;
2483
2484 xa_init_flags(&op->prefetch_range.range, XA_FLAGS_ALLOC);
2485 op->prefetch_range.ranges_count = 0;
2486
2487 if (prefetch_region == DRM_XE_CONSULT_MEM_ADVISE_PREF_LOC) {
2488 dpagemap = xe_vma_resolve_pagemap(vma,
2489 xe_device_get_root_tile(vm->xe));
2490 } else if (prefetch_region) {
2491 tile = &vm->xe->tiles[region_to_mem_type[prefetch_region] -
2492 XE_PL_VRAM0];
2493 dpagemap = xe_tile_local_pagemap(tile);
2494 }
2495
2496 op->prefetch_range.dpagemap = dpagemap;
2497 alloc_next_range:
2498 svm_range = xe_svm_range_find_or_insert(vm, addr, vma, &ctx);
2499
2500 if (PTR_ERR(svm_range) == -ENOENT) {
2501 u64 ret = xe_svm_find_vma_start(vm, addr, range_end, vma);
2502
2503 addr = ret == ULONG_MAX ? 0 : ret;
2504 if (addr)
2505 goto alloc_next_range;
2506 else
2507 goto print_op_label;
2508 }
2509
2510 if (IS_ERR(svm_range)) {
2511 err = PTR_ERR(svm_range);
2512 goto unwind_prefetch_ops;
2513 }
2514
2515 if (xe_svm_range_validate(vm, svm_range, tile_mask, dpagemap)) {
2516 xe_svm_range_debug(svm_range, "PREFETCH - RANGE IS VALID");
2517 goto check_next_range;
2518 }
2519
2520 err = xa_alloc(&op->prefetch_range.range,
2521 &i, svm_range, xa_limit_32b,
2522 GFP_KERNEL);
2523
2524 if (err)
2525 goto unwind_prefetch_ops;
2526
2527 op->prefetch_range.ranges_count++;
2528 vops->flags |= XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH;
2529 xe_svm_range_debug(svm_range, "PREFETCH - RANGE CREATED");
2530 check_next_range:
2531 if (range_end > xe_svm_range_end(svm_range) &&
2532 xe_svm_range_end(svm_range) < xe_vma_end(vma)) {
2533 addr = xe_svm_range_end(svm_range);
2534 goto alloc_next_range;
2535 }
2536 }
2537 print_op_label:
2538 print_op(vm->xe, __op);
2539 }
2540
2541 return ops;
2542
2543 unwind_prefetch_ops:
2544 xe_svm_prefetch_gpuva_ops_fini(ops);
2545 drm_gpuva_ops_free(&vm->gpuvm, ops);
2546 return ERR_PTR(err);
2547 }
2548
2549 ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_create, ERRNO);
2550
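/*
 * Allocate and initialise a VMA for a map operation. For BO-backed VMAs the
 * required reservation objects are locked inside a validation transaction,
 * and external (non VM-private) BOs get the VM's preempt fences attached.
 * Userptr VMAs have their pages pinned immediately.
 */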
new_vma(struct xe_vm * vm,struct drm_gpuva_op_map * op,struct xe_vma_mem_attr * attr,unsigned int flags)2551 static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
2552 struct xe_vma_mem_attr *attr, unsigned int flags)
2553 {
2554 struct xe_bo *bo = op->gem.obj ? gem_to_xe_bo(op->gem.obj) : NULL;
2555 struct xe_validation_ctx ctx;
2556 struct drm_exec exec;
2557 struct xe_vma *vma;
2558 int err = 0;
2559
2560 lockdep_assert_held_write(&vm->lock);
2561
2562 if (bo) {
2563 err = 0;
2564 xe_validation_guard(&ctx, &vm->xe->val, &exec,
2565 (struct xe_val_flags) {.interruptible = true}, err) {
2566 if (!bo->vm) {
2567 err = drm_exec_lock_obj(&exec, xe_vm_obj(vm));
2568 drm_exec_retry_on_contention(&exec);
2569 }
2570 if (!err) {
2571 err = drm_exec_lock_obj(&exec, &bo->ttm.base);
2572 drm_exec_retry_on_contention(&exec);
2573 }
2574 if (err)
2575 return ERR_PTR(err);
2576
2577 vma = xe_vma_create(vm, bo, op->gem.offset,
2578 op->va.addr, op->va.addr +
2579 op->va.range - 1, attr, flags);
2580 if (IS_ERR(vma))
2581 return vma;
2582
2583 if (!bo->vm) {
2584 err = add_preempt_fences(vm, bo);
2585 if (err) {
2586 prep_vma_destroy(vm, vma, false);
2587 xe_vma_destroy(vma, NULL);
2588 }
2589 }
2590 }
2591 if (err)
2592 return ERR_PTR(err);
2593 } else {
2594 vma = xe_vma_create(vm, NULL, op->gem.offset,
2595 op->va.addr, op->va.addr +
2596 op->va.range - 1, attr, flags);
2597 if (IS_ERR(vma))
2598 return vma;
2599
2600 if (xe_vma_is_userptr(vma)) {
2601 err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
2602 			/*
2603 			 * -EBUSY has the dedicated meaning that a user fence
2604 			 * attached to the VMA is busy. In practice,
2605 			 * xe_vma_userptr_pin_pages() can only fail with -EBUSY
2606 			 * when we are low on memory, so convert this to -ENOMEM.
2607 			 */
2608 if (err == -EBUSY)
2609 err = -ENOMEM;
2610 }
2611 }
2612 if (err) {
2613 prep_vma_destroy(vm, vma, false);
2614 xe_vma_destroy_unlocked(vma);
2615 vma = ERR_PTR(err);
2616 }
2617
2618 return vma;
2619 }
2620
xe_vma_max_pte_size(struct xe_vma * vma)2621 static u64 xe_vma_max_pte_size(struct xe_vma *vma)
2622 {
2623 if (vma->gpuva.flags & XE_VMA_PTE_1G)
2624 return SZ_1G;
2625 else if (vma->gpuva.flags & (XE_VMA_PTE_2M | XE_VMA_PTE_COMPACT))
2626 return SZ_2M;
2627 else if (vma->gpuva.flags & XE_VMA_PTE_64K)
2628 return SZ_64K;
2629 else if (vma->gpuva.flags & XE_VMA_PTE_4K)
2630 return SZ_4K;
2631
2632 	return SZ_1G;	/* Uninitialized, use max size */
2633 }
2634
xe_vma_set_pte_size(struct xe_vma * vma,u64 size)2635 static void xe_vma_set_pte_size(struct xe_vma *vma, u64 size)
2636 {
2637 switch (size) {
2638 case SZ_1G:
2639 vma->gpuva.flags |= XE_VMA_PTE_1G;
2640 break;
2641 case SZ_2M:
2642 vma->gpuva.flags |= XE_VMA_PTE_2M;
2643 break;
2644 case SZ_64K:
2645 vma->gpuva.flags |= XE_VMA_PTE_64K;
2646 break;
2647 case SZ_4K:
2648 vma->gpuva.flags |= XE_VMA_PTE_4K;
2649 break;
2650 }
2651 }
2652
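/*
 * Commit a single operation to the VMA tree (inserting new VMAs and marking
 * unmapped ones destroyed) so later steps, and a potential unwind, see a
 * consistent view. The XE_VMA_OP_*_COMMITTED flags record what to undo on
 * error.
 */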
xe_vma_op_commit(struct xe_vm * vm,struct xe_vma_op * op)2653 static int xe_vma_op_commit(struct xe_vm *vm, struct xe_vma_op *op)
2654 {
2655 int err = 0;
2656
2657 lockdep_assert_held_write(&vm->lock);
2658
2659 switch (op->base.op) {
2660 case DRM_GPUVA_OP_MAP:
2661 err |= xe_vm_insert_vma(vm, op->map.vma);
2662 if (!err)
2663 op->flags |= XE_VMA_OP_COMMITTED;
2664 break;
2665 case DRM_GPUVA_OP_REMAP:
2666 {
2667 u8 tile_present =
2668 gpuva_to_vma(op->base.remap.unmap->va)->tile_present;
2669
2670 prep_vma_destroy(vm, gpuva_to_vma(op->base.remap.unmap->va),
2671 true);
2672 op->flags |= XE_VMA_OP_COMMITTED;
2673
2674 if (op->remap.prev) {
2675 err |= xe_vm_insert_vma(vm, op->remap.prev);
2676 if (!err)
2677 op->flags |= XE_VMA_OP_PREV_COMMITTED;
2678 if (!err && op->remap.skip_prev) {
2679 op->remap.prev->tile_present =
2680 tile_present;
2681 }
2682 }
2683 if (op->remap.next) {
2684 err |= xe_vm_insert_vma(vm, op->remap.next);
2685 if (!err)
2686 op->flags |= XE_VMA_OP_NEXT_COMMITTED;
2687 if (!err && op->remap.skip_next) {
2688 op->remap.next->tile_present =
2689 tile_present;
2690 }
2691 }
2692
2693 /*
2694 * Adjust for partial unbind after removing VMA from VM. In case
2695 * of unwind we might need to undo this later.
2696 */
2697 if (!err) {
2698 op->base.remap.unmap->va->va.addr = op->remap.start;
2699 op->base.remap.unmap->va->va.range = op->remap.range;
2700 }
2701 break;
2702 }
2703 case DRM_GPUVA_OP_UNMAP:
2704 prep_vma_destroy(vm, gpuva_to_vma(op->base.unmap.va), true);
2705 op->flags |= XE_VMA_OP_COMMITTED;
2706 break;
2707 case DRM_GPUVA_OP_PREFETCH:
2708 op->flags |= XE_VMA_OP_COMMITTED;
2709 break;
2710 default:
2711 drm_warn(&vm->xe->drm, "NOT POSSIBLE\n");
2712 }
2713
2714 return err;
2715 }
2716
2717 /**
2718 * xe_vma_has_default_mem_attrs - Check if a VMA has default memory attributes
2719 * @vma: Pointer to the xe_vma structure to check
2720 *
2721 * This function determines whether the given VMA (Virtual Memory Area)
2722 * has its memory attributes set to their default values. Specifically,
2723 * it checks the following conditions:
2724 *
2725  * - `atomic_access` is `DRM_XE_ATOMIC_UNDEFINED`
2726 * - `pat_index` is equal to `default_pat_index`
2727 * - `preferred_loc.devmem_fd` is `DRM_XE_PREFERRED_LOC_DEFAULT_DEVICE`
2728 * - `preferred_loc.migration_policy` is `DRM_XE_MIGRATE_ALL_PAGES`
2729 *
2730 * Return: true if all attributes are at their default values, false otherwise.
2731 */
xe_vma_has_default_mem_attrs(struct xe_vma * vma)2732 bool xe_vma_has_default_mem_attrs(struct xe_vma *vma)
2733 {
2734 return (vma->attr.atomic_access == DRM_XE_ATOMIC_UNDEFINED &&
2735 vma->attr.pat_index == vma->attr.default_pat_index &&
2736 vma->attr.preferred_loc.devmem_fd == DRM_XE_PREFERRED_LOC_DEFAULT_DEVICE &&
2737 vma->attr.preferred_loc.migration_policy == DRM_XE_MIGRATE_ALL_PAGES);
2738 }
2739
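/*
 * Walk the GPUVA operations, create the VMAs they require, account the
 * page-table update work per tile and commit each operation to the VMA tree.
 * On error, already committed operations are expected to be unwound with
 * vm_bind_ioctl_ops_unwind().
 */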
vm_bind_ioctl_ops_parse(struct xe_vm * vm,struct drm_gpuva_ops * ops,struct xe_vma_ops * vops)2740 static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct drm_gpuva_ops *ops,
2741 struct xe_vma_ops *vops)
2742 {
2743 struct xe_device *xe = vm->xe;
2744 struct drm_gpuva_op *__op;
2745 struct xe_tile *tile;
2746 u8 id, tile_mask = 0;
2747 int err = 0;
2748
2749 lockdep_assert_held_write(&vm->lock);
2750
2751 for_each_tile(tile, vm->xe, id)
2752 tile_mask |= 0x1 << id;
2753
2754 drm_gpuva_for_each_op(__op, ops) {
2755 struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2756 struct xe_vma *vma;
2757 unsigned int flags = 0;
2758
2759 INIT_LIST_HEAD(&op->link);
2760 list_add_tail(&op->link, &vops->list);
2761 op->tile_mask = tile_mask;
2762
2763 switch (op->base.op) {
2764 case DRM_GPUVA_OP_MAP:
2765 {
2766 struct xe_vma_mem_attr default_attr = {
2767 .preferred_loc = {
2768 .devmem_fd = DRM_XE_PREFERRED_LOC_DEFAULT_DEVICE,
2769 .migration_policy = DRM_XE_MIGRATE_ALL_PAGES,
2770 },
2771 .atomic_access = DRM_XE_ATOMIC_UNDEFINED,
2772 .default_pat_index = op->map.pat_index,
2773 .pat_index = op->map.pat_index,
2774 .purgeable_state = XE_MADV_PURGEABLE_WILLNEED,
2775 };
2776
2777 flags |= op->map.vma_flags & XE_VMA_CREATE_MASK;
2778
2779 vma = new_vma(vm, &op->base.map, &default_attr,
2780 flags);
2781 if (IS_ERR(vma))
2782 return PTR_ERR(vma);
2783
2784 op->map.vma = vma;
2785 if (((op->map.immediate || !xe_vm_in_fault_mode(vm)) &&
2786 !(op->map.vma_flags & XE_VMA_SYSTEM_ALLOCATOR)) ||
2787 op->map.invalidate_on_bind)
2788 xe_vma_ops_incr_pt_update_ops(vops,
2789 op->tile_mask, 1);
2790 break;
2791 }
2792 case DRM_GPUVA_OP_REMAP:
2793 {
2794 struct xe_vma *old =
2795 gpuva_to_vma(op->base.remap.unmap->va);
2796 bool skip = xe_vma_is_cpu_addr_mirror(old);
2797 u64 start = xe_vma_start(old), end = xe_vma_end(old);
2798 int num_remap_ops = 0;
2799
2800 if (op->base.remap.prev)
2801 start = op->base.remap.prev->va.addr +
2802 op->base.remap.prev->va.range;
2803 if (op->base.remap.next)
2804 end = op->base.remap.next->va.addr;
2805
2806 if (xe_vma_is_cpu_addr_mirror(old) &&
2807 xe_svm_has_mapping(vm, start, end)) {
2808 if (vops->flags & XE_VMA_OPS_FLAG_MADVISE)
2809 xe_svm_unmap_address_range(vm, start, end);
2810 else
2811 return -EBUSY;
2812 }
2813
2814 op->remap.start = xe_vma_start(old);
2815 op->remap.range = xe_vma_size(old);
2816 op->remap.old_start = op->remap.start;
2817 op->remap.old_range = op->remap.range;
2818
2819 flags |= op->base.remap.unmap->va->flags & XE_VMA_CREATE_MASK;
2820 if (op->base.remap.prev) {
2821 vma = new_vma(vm, op->base.remap.prev,
2822 &old->attr, flags);
2823 if (IS_ERR(vma))
2824 return PTR_ERR(vma);
2825
2826 op->remap.prev = vma;
2827
2828 /*
2829 * Userptr creates a new SG mapping so
2830 * we must also rebind.
2831 */
2832 op->remap.skip_prev = skip ||
2833 (!xe_vma_is_userptr(old) &&
2834 IS_ALIGNED(xe_vma_end(vma),
2835 xe_vma_max_pte_size(old)));
2836 if (op->remap.skip_prev) {
2837 xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
2838 op->remap.range -=
2839 xe_vma_end(vma) -
2840 xe_vma_start(old);
2841 op->remap.start = xe_vma_end(vma);
2842 vm_dbg(&xe->drm, "REMAP:SKIP_PREV: addr=0x%016llx, range=0x%016llx",
2843 (ULL)op->remap.start,
2844 (ULL)op->remap.range);
2845 } else {
2846 num_remap_ops++;
2847 }
2848 }
2849
2850 if (op->base.remap.next) {
2851 vma = new_vma(vm, op->base.remap.next,
2852 &old->attr, flags);
2853 if (IS_ERR(vma))
2854 return PTR_ERR(vma);
2855
2856 op->remap.next = vma;
2857
2858 /*
2859 * Userptr creates a new SG mapping so
2860 * we must also rebind.
2861 */
2862 op->remap.skip_next = skip ||
2863 (!xe_vma_is_userptr(old) &&
2864 IS_ALIGNED(xe_vma_start(vma),
2865 xe_vma_max_pte_size(old)));
2866 if (op->remap.skip_next) {
2867 xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
2868 op->remap.range -=
2869 xe_vma_end(old) -
2870 xe_vma_start(vma);
2871 vm_dbg(&xe->drm, "REMAP:SKIP_NEXT: addr=0x%016llx, range=0x%016llx",
2872 (ULL)op->remap.start,
2873 (ULL)op->remap.range);
2874 } else {
2875 num_remap_ops++;
2876 }
2877 }
2878 if (!skip)
2879 num_remap_ops++;
2880
2881 xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask, num_remap_ops);
2882 break;
2883 }
2884 case DRM_GPUVA_OP_UNMAP:
2885 vma = gpuva_to_vma(op->base.unmap.va);
2886
2887 if (xe_vma_is_cpu_addr_mirror(vma) &&
2888 xe_svm_has_mapping(vm, xe_vma_start(vma),
2889 xe_vma_end(vma)) &&
2890 !(vops->flags & XE_VMA_OPS_FLAG_ALLOW_SVM_UNMAP))
2891 return -EBUSY;
2892
2893 if (!xe_vma_is_cpu_addr_mirror(vma))
2894 xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask, 1);
2895 break;
2896 case DRM_GPUVA_OP_PREFETCH:
2897 vma = gpuva_to_vma(op->base.prefetch.va);
2898
2899 if (xe_vma_is_userptr(vma)) {
2900 err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
2901 if (err)
2902 return err;
2903 }
2904
2905 if (xe_vma_is_cpu_addr_mirror(vma))
2906 xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask,
2907 op->prefetch_range.ranges_count);
2908 else
2909 xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask, 1);
2910
2911 break;
2912 default:
2913 drm_warn(&vm->xe->drm, "NOT POSSIBLE\n");
2914 }
2915
2916 err = xe_vma_op_commit(vm, op);
2917 if (err)
2918 return err;
2919 }
2920
2921 return 0;
2922 }
2923
xe_vma_op_unwind(struct xe_vm * vm,struct xe_vma_op * op,bool post_commit,bool prev_post_commit,bool next_post_commit)2924 static void xe_vma_op_unwind(struct xe_vm *vm, struct xe_vma_op *op,
2925 bool post_commit, bool prev_post_commit,
2926 bool next_post_commit)
2927 {
2928 lockdep_assert_held_write(&vm->lock);
2929
2930 switch (op->base.op) {
2931 case DRM_GPUVA_OP_MAP:
2932 if (op->map.vma) {
2933 prep_vma_destroy(vm, op->map.vma, post_commit);
2934 xe_vma_destroy_unlocked(op->map.vma);
2935 }
2936 break;
2937 case DRM_GPUVA_OP_UNMAP:
2938 {
2939 struct xe_vma *vma = gpuva_to_vma(op->base.unmap.va);
2940
2941 if (vma) {
2942 xe_svm_notifier_lock(vm);
2943 vma->gpuva.flags &= ~XE_VMA_DESTROYED;
2944 xe_svm_notifier_unlock(vm);
2945 if (post_commit)
2946 xe_vm_insert_vma(vm, vma);
2947 }
2948 break;
2949 }
2950 case DRM_GPUVA_OP_REMAP:
2951 {
2952 struct xe_vma *vma = gpuva_to_vma(op->base.remap.unmap->va);
2953
2954 if (op->remap.prev) {
2955 prep_vma_destroy(vm, op->remap.prev, prev_post_commit);
2956 xe_vma_destroy_unlocked(op->remap.prev);
2957 }
2958 if (op->remap.next) {
2959 prep_vma_destroy(vm, op->remap.next, next_post_commit);
2960 xe_vma_destroy_unlocked(op->remap.next);
2961 }
2962 if (vma) {
2963 xe_svm_notifier_lock(vm);
2964 vma->gpuva.flags &= ~XE_VMA_DESTROYED;
2965 xe_svm_notifier_unlock(vm);
2966 if (post_commit) {
2967 /*
2968 * Restore the old va range, in case of the
2969 * prev/next skip optimisation. Otherwise what
2970 * we re-insert here could be smaller than the
2971 * original range.
2972 */
2973 op->base.remap.unmap->va->va.addr =
2974 op->remap.old_start;
2975 op->base.remap.unmap->va->va.range =
2976 op->remap.old_range;
2977 xe_vm_insert_vma(vm, vma);
2978 }
2979 }
2980 break;
2981 }
2982 case DRM_GPUVA_OP_PREFETCH:
2983 /* Nothing to do */
2984 break;
2985 default:
2986 drm_warn(&vm->xe->drm, "NOT POSSIBLE\n");
2987 }
2988 }
2989
vm_bind_ioctl_ops_unwind(struct xe_vm * vm,struct drm_gpuva_ops ** ops,int num_ops_list)2990 static void vm_bind_ioctl_ops_unwind(struct xe_vm *vm,
2991 struct drm_gpuva_ops **ops,
2992 int num_ops_list)
2993 {
2994 int i;
2995
2996 for (i = num_ops_list - 1; i >= 0; --i) {
2997 struct drm_gpuva_ops *__ops = ops[i];
2998 struct drm_gpuva_op *__op;
2999
3000 if (!__ops)
3001 continue;
3002
3003 drm_gpuva_for_each_op_reverse(__op, __ops) {
3004 struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
3005
3006 xe_vma_op_unwind(vm, op,
3007 op->flags & XE_VMA_OP_COMMITTED,
3008 op->flags & XE_VMA_OP_PREV_COMMITTED,
3009 op->flags & XE_VMA_OP_NEXT_COMMITTED);
3010 }
3011 }
3012 }
3013
3014 /**
3015 * struct xe_vma_lock_and_validate_flags - Flags for vma_lock_and_validate()
3016 * @res_evict: Allow evicting resources during validation
3017 * @validate: Perform BO validation
3018 * @request_decompress: Request BO decompression
3019 * @check_purged: Reject operation if BO is purged
3020 */
3021 struct xe_vma_lock_and_validate_flags {
3022 u32 res_evict : 1;
3023 u32 validate : 1;
3024 u32 request_decompress : 1;
3025 u32 check_purged : 1;
3026 };
3027
vma_lock_and_validate(struct drm_exec * exec,struct xe_vma * vma,struct xe_vma_lock_and_validate_flags flags)3028 static int vma_lock_and_validate(struct drm_exec *exec, struct xe_vma *vma,
3029 struct xe_vma_lock_and_validate_flags flags)
3030 {
3031 struct xe_bo *bo = xe_vma_bo(vma);
3032 struct xe_vm *vm = xe_vma_vm(vma);
3033 int err = 0;
3034
3035 if (bo) {
3036 if (!bo->vm)
3037 err = drm_exec_lock_obj(exec, &bo->ttm.base);
3038
3039 /* Reject new mappings to DONTNEED/purged BOs; allow cleanup operations */
3040 if (!err && flags.check_purged) {
3041 if (xe_bo_madv_is_dontneed(bo))
3042 err = -EBUSY; /* BO marked purgeable */
3043 else if (xe_bo_is_purged(bo))
3044 err = -EINVAL; /* BO already purged */
3045 }
3046
3047 if (!err && flags.validate)
3048 err = xe_bo_validate(bo, vm,
3049 xe_vm_allow_vm_eviction(vm) &&
3050 flags.res_evict, exec);
3051
3052 if (err)
3053 return err;
3054
3055 if (flags.request_decompress)
3056 err = xe_bo_decompress(bo);
3057 }
3058
3059 return err;
3060 }
3061
check_ufence(struct xe_vma * vma)3062 static int check_ufence(struct xe_vma *vma)
3063 {
3064 if (vma->ufence) {
3065 struct xe_user_fence * const f = vma->ufence;
3066
3067 if (!xe_sync_ufence_get_status(f))
3068 return -EBUSY;
3069
3070 vma->ufence = NULL;
3071 xe_sync_ufence_put(f);
3072 }
3073
3074 return 0;
3075 }
3076
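/*
 * Migrate and populate the SVM ranges attached to a prefetch operation:
 * ranges are moved to the requested device pagemap (or back to system memory
 * when none is requested) and their pages are then acquired.
 */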
prefetch_ranges(struct xe_vm * vm,struct xe_vma_op * op)3077 static int prefetch_ranges(struct xe_vm *vm, struct xe_vma_op *op)
3078 {
3079 bool devmem_possible = IS_DGFX(vm->xe) && IS_ENABLED(CONFIG_DRM_XE_PAGEMAP);
3080 struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
3081 struct drm_pagemap *dpagemap = op->prefetch_range.dpagemap;
3082 int err = 0;
3083
3084 struct xe_svm_range *svm_range;
3085 struct drm_gpusvm_ctx ctx = {};
3086 unsigned long i;
3087
3088 if (!xe_vma_is_cpu_addr_mirror(vma))
3089 return 0;
3090
3091 ctx.read_only = xe_vma_read_only(vma);
3092 ctx.devmem_possible = devmem_possible;
3093 ctx.check_pages_threshold = devmem_possible ? SZ_64K : 0;
3094 ctx.device_private_page_owner = xe_svm_private_page_owner(vm, !dpagemap);
3095
3096 /* TODO: Threading the migration */
3097 xa_for_each(&op->prefetch_range.range, i, svm_range) {
3098 if (!dpagemap)
3099 xe_svm_range_migrate_to_smem(vm, svm_range);
3100
3101 if (IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)) {
3102 drm_dbg(&vm->xe->drm,
3103 "Prefetch pagemap is %s start 0x%016lx end 0x%016lx\n",
3104 dpagemap ? dpagemap->drm->unique : "system",
3105 xe_svm_range_start(svm_range), xe_svm_range_end(svm_range));
3106 }
3107
3108 if (xe_svm_range_needs_migrate_to_vram(svm_range, vma, dpagemap)) {
3109 err = xe_svm_alloc_vram(svm_range, &ctx, dpagemap);
3110 if (err) {
3111 drm_dbg(&vm->xe->drm, "VRAM allocation failed, retry from userspace, asid=%u, gpusvm=%p, errno=%pe\n",
3112 vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
3113 return -ENODATA;
3114 }
3115 xe_svm_range_debug(svm_range, "PREFETCH - RANGE MIGRATED TO VRAM");
3116 }
3117
3118 err = xe_svm_range_get_pages(vm, svm_range, &ctx);
3119 if (err) {
3120 drm_dbg(&vm->xe->drm, "Get pages failed, asid=%u, gpusvm=%p, errno=%pe\n",
3121 vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
3122 if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM)
3123 err = -ENODATA;
3124 return err;
3125 }
3126 xe_svm_range_debug(svm_range, "PREFETCH - RANGE GET PAGES DONE");
3127 }
3128
3129 return err;
3130 }
3131
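/*
 * Lock and, where required, validate the backing store touched by a single
 * operation as part of the drm_exec transaction, applying the per-op
 * eviction, validation, decompression and purge-check rules.
 */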
op_lock_and_prep(struct drm_exec * exec,struct xe_vm * vm,struct xe_vma_ops * vops,struct xe_vma_op * op)3132 static int op_lock_and_prep(struct drm_exec *exec, struct xe_vm *vm,
3133 struct xe_vma_ops *vops, struct xe_vma_op *op)
3134 {
3135 int err = 0;
3136 bool res_evict;
3137
3138 	/*
3139 	 * We only allow evicting a BO within the VM if the operation is not
3140 	 * part of an array of binds, as one bind in the array could otherwise
3141 	 * evict a BO needed by another bind in the same array.
3142 	 */
3143 res_evict = !(vops->flags & XE_VMA_OPS_ARRAY_OF_BINDS);
3144
3145 switch (op->base.op) {
3146 case DRM_GPUVA_OP_MAP:
3147 if (!op->map.invalidate_on_bind)
3148 err = vma_lock_and_validate(exec, op->map.vma,
3149 (struct xe_vma_lock_and_validate_flags) {
3150 .res_evict = res_evict,
3151 .validate = !xe_vm_in_fault_mode(vm) ||
3152 op->map.immediate,
3153 .request_decompress =
3154 op->map.request_decompress,
3155 .check_purged = true,
3156 });
3157 break;
3158 case DRM_GPUVA_OP_REMAP:
3159 err = check_ufence(gpuva_to_vma(op->base.remap.unmap->va));
3160 if (err)
3161 break;
3162
3163 err = vma_lock_and_validate(exec,
3164 gpuva_to_vma(op->base.remap.unmap->va),
3165 (struct xe_vma_lock_and_validate_flags) {
3166 .res_evict = res_evict,
3167 .validate = false,
3168 .request_decompress = false,
3169 .check_purged = false,
3170 });
3171 if (!err && op->remap.prev)
3172 err = vma_lock_and_validate(exec, op->remap.prev,
3173 (struct xe_vma_lock_and_validate_flags) {
3174 .res_evict = res_evict,
3175 .validate = true,
3176 .request_decompress = false,
3177 .check_purged = true,
3178 });
3179 if (!err && op->remap.next)
3180 err = vma_lock_and_validate(exec, op->remap.next,
3181 (struct xe_vma_lock_and_validate_flags) {
3182 .res_evict = res_evict,
3183 .validate = true,
3184 .request_decompress = false,
3185 .check_purged = true,
3186 });
3187 break;
3188 case DRM_GPUVA_OP_UNMAP:
3189 err = check_ufence(gpuva_to_vma(op->base.unmap.va));
3190 if (err)
3191 break;
3192
3193 err = vma_lock_and_validate(exec,
3194 gpuva_to_vma(op->base.unmap.va),
3195 (struct xe_vma_lock_and_validate_flags) {
3196 .res_evict = res_evict,
3197 .validate = false,
3198 .request_decompress = false,
3199 .check_purged = false,
3200 });
3201 break;
3202 case DRM_GPUVA_OP_PREFETCH:
3203 {
3204 struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
3205 u32 region;
3206
3207 if (!xe_vma_is_cpu_addr_mirror(vma)) {
3208 region = op->prefetch.region;
3209 xe_assert(vm->xe, region == DRM_XE_CONSULT_MEM_ADVISE_PREF_LOC ||
3210 region <= ARRAY_SIZE(region_to_mem_type));
3211 }
3212
3213 /*
3214 * Prefetch attempts to migrate BO's backing store without
3215 * repopulating it first. Purged BOs have no backing store
3216 * to migrate, so reject the operation.
3217 */
3218 err = vma_lock_and_validate(exec,
3219 gpuva_to_vma(op->base.prefetch.va),
3220 (struct xe_vma_lock_and_validate_flags) {
3221 .res_evict = res_evict,
3222 .validate = false,
3223 .request_decompress = false,
3224 .check_purged = true,
3225 });
3226 if (!err && !xe_vma_has_no_bo(vma))
3227 err = xe_bo_migrate(xe_vma_bo(vma),
3228 region_to_mem_type[region],
3229 NULL,
3230 exec);
3231 break;
3232 }
3233 default:
3234 drm_warn(&vm->xe->drm, "NOT POSSIBLE\n");
3235 }
3236
3237 return err;
3238 }
3239
vm_bind_ioctl_ops_prefetch_ranges(struct xe_vm * vm,struct xe_vma_ops * vops)3240 static int vm_bind_ioctl_ops_prefetch_ranges(struct xe_vm *vm, struct xe_vma_ops *vops)
3241 {
3242 struct xe_vma_op *op;
3243 int err;
3244
3245 if (!(vops->flags & XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH))
3246 return 0;
3247
3248 list_for_each_entry(op, &vops->list, link) {
3249 if (op->base.op == DRM_GPUVA_OP_PREFETCH) {
3250 err = prefetch_ranges(vm, op);
3251 if (err)
3252 return err;
3253 }
3254 }
3255
3256 return 0;
3257 }
3258
vm_bind_ioctl_ops_lock_and_prep(struct drm_exec * exec,struct xe_vm * vm,struct xe_vma_ops * vops)3259 static int vm_bind_ioctl_ops_lock_and_prep(struct drm_exec *exec,
3260 struct xe_vm *vm,
3261 struct xe_vma_ops *vops)
3262 {
3263 struct xe_vma_op *op;
3264 int err;
3265
3266 err = drm_exec_lock_obj(exec, xe_vm_obj(vm));
3267 if (err)
3268 return err;
3269
3270 list_for_each_entry(op, &vops->list, link) {
3271 err = op_lock_and_prep(exec, vm, vops, op);
3272 if (err)
3273 return err;
3274 }
3275
3276 #ifdef TEST_VM_OPS_ERROR
3277 if (vops->inject_error &&
3278 vm->xe->vm_inject_error_position == FORCE_OP_ERROR_LOCK)
3279 return -ENOSPC;
3280 #endif
3281
3282 return 0;
3283 }
3284
op_trace(struct xe_vma_op * op)3285 static void op_trace(struct xe_vma_op *op)
3286 {
3287 switch (op->base.op) {
3288 case DRM_GPUVA_OP_MAP:
3289 trace_xe_vma_bind(op->map.vma);
3290 break;
3291 case DRM_GPUVA_OP_REMAP:
3292 trace_xe_vma_unbind(gpuva_to_vma(op->base.remap.unmap->va));
3293 if (op->remap.prev)
3294 trace_xe_vma_bind(op->remap.prev);
3295 if (op->remap.next)
3296 trace_xe_vma_bind(op->remap.next);
3297 break;
3298 case DRM_GPUVA_OP_UNMAP:
3299 trace_xe_vma_unbind(gpuva_to_vma(op->base.unmap.va));
3300 break;
3301 case DRM_GPUVA_OP_PREFETCH:
3302 trace_xe_vma_bind(gpuva_to_vma(op->base.prefetch.va));
3303 break;
3304 case DRM_GPUVA_OP_DRIVER:
3305 break;
3306 default:
3307 XE_WARN_ON("NOT POSSIBLE");
3308 }
3309 }
3310
trace_xe_vm_ops_execute(struct xe_vma_ops * vops)3311 static void trace_xe_vm_ops_execute(struct xe_vma_ops *vops)
3312 {
3313 struct xe_vma_op *op;
3314
3315 list_for_each_entry(op, &vops->list, link)
3316 op_trace(op);
3317 }
3318
vm_ops_setup_tile_args(struct xe_vm * vm,struct xe_vma_ops * vops)3319 static int vm_ops_setup_tile_args(struct xe_vm *vm, struct xe_vma_ops *vops)
3320 {
3321 struct xe_exec_queue *q = vops->q;
3322 struct xe_tile *tile;
3323 int number_tiles = 0;
3324 u8 id;
3325
3326 for_each_tile(tile, vm->xe, id) {
3327 if (vops->pt_update_ops[id].num_ops)
3328 ++number_tiles;
3329
3330 if (vops->pt_update_ops[id].q)
3331 continue;
3332
3333 if (q) {
3334 vops->pt_update_ops[id].q = q;
3335 if (vm->pt_root[id] && !list_empty(&q->multi_gt_list))
3336 q = list_next_entry(q, multi_gt_list);
3337 } else {
3338 vops->pt_update_ops[id].q = vm->q[id];
3339 }
3340 }
3341
3342 return number_tiles;
3343 }
3344
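/*
 * Run the page-table updates for every tile with work to do and collect the
 * resulting fences (plus the TLB-invalidation last fences unless skipped)
 * into a single dma_fence_array for the caller to wait on or install.
 */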
ops_execute(struct xe_vm * vm,struct xe_vma_ops * vops)3345 static struct dma_fence *ops_execute(struct xe_vm *vm,
3346 struct xe_vma_ops *vops)
3347 {
3348 struct xe_tile *tile;
3349 struct dma_fence *fence = NULL;
3350 struct dma_fence **fences = NULL;
3351 struct dma_fence_array *cf = NULL;
3352 int number_tiles = 0, current_fence = 0, n_fence = 0, err, i;
3353 u8 id;
3354
3355 number_tiles = vm_ops_setup_tile_args(vm, vops);
3356 if (number_tiles == 0)
3357 return ERR_PTR(-ENODATA);
3358
3359 for_each_tile(tile, vm->xe, id) {
3360 ++n_fence;
3361
3362 if (!(vops->flags & XE_VMA_OPS_FLAG_SKIP_TLB_WAIT))
3363 for_each_tlb_inval(i)
3364 ++n_fence;
3365 }
3366
3367 fences = kmalloc_objs(*fences, n_fence);
3368 if (!fences) {
3369 fence = ERR_PTR(-ENOMEM);
3370 goto err_trace;
3371 }
3372
3373 cf = dma_fence_array_alloc(n_fence);
3374 if (!cf) {
3375 fence = ERR_PTR(-ENOMEM);
3376 goto err_out;
3377 }
3378
3379 for_each_tile(tile, vm->xe, id) {
3380 if (!vops->pt_update_ops[id].num_ops)
3381 continue;
3382
3383 err = xe_pt_update_ops_prepare(tile, vops);
3384 if (err) {
3385 fence = ERR_PTR(err);
3386 goto err_out;
3387 }
3388 }
3389
3390 trace_xe_vm_ops_execute(vops);
3391
3392 for_each_tile(tile, vm->xe, id) {
3393 struct xe_exec_queue *q = vops->pt_update_ops[tile->id].q;
3394
3395 fence = NULL;
3396 if (!vops->pt_update_ops[id].num_ops)
3397 goto collect_fences;
3398
3399 fence = xe_pt_update_ops_run(tile, vops);
3400 if (IS_ERR(fence))
3401 goto err_out;
3402
3403 collect_fences:
3404 fences[current_fence++] = fence ?: dma_fence_get_stub();
3405 if (vops->flags & XE_VMA_OPS_FLAG_SKIP_TLB_WAIT)
3406 continue;
3407
3408 xe_migrate_job_lock(tile->migrate, q);
3409 for_each_tlb_inval(i)
3410 fences[current_fence++] =
3411 xe_exec_queue_tlb_inval_last_fence_get(q, vm, i);
3412 xe_migrate_job_unlock(tile->migrate, q);
3413 }
3414
3415 xe_assert(vm->xe, current_fence == n_fence);
3416 dma_fence_array_init(cf, n_fence, fences, dma_fence_context_alloc(1),
3417 1, false);
3418 fence = &cf->base;
3419
3420 for_each_tile(tile, vm->xe, id) {
3421 if (!vops->pt_update_ops[id].num_ops)
3422 continue;
3423
3424 xe_pt_update_ops_fini(tile, vops);
3425 }
3426
3427 return fence;
3428
3429 err_out:
3430 for_each_tile(tile, vm->xe, id) {
3431 if (!vops->pt_update_ops[id].num_ops)
3432 continue;
3433
3434 xe_pt_update_ops_abort(tile, vops);
3435 }
3436 while (current_fence)
3437 dma_fence_put(fences[--current_fence]);
3438 kfree(fences);
3439 kfree(cf);
3440
3441 err_trace:
3442 trace_xe_vm_ops_fail(vm);
3443 return fence;
3444 }
3445
vma_add_ufence(struct xe_vma * vma,struct xe_user_fence * ufence)3446 static void vma_add_ufence(struct xe_vma *vma, struct xe_user_fence *ufence)
3447 {
3448 if (vma->ufence)
3449 xe_sync_ufence_put(vma->ufence);
3450 vma->ufence = __xe_sync_ufence_get(ufence);
3451 }
3452
op_add_ufence(struct xe_vm * vm,struct xe_vma_op * op,struct xe_user_fence * ufence)3453 static void op_add_ufence(struct xe_vm *vm, struct xe_vma_op *op,
3454 struct xe_user_fence *ufence)
3455 {
3456 switch (op->base.op) {
3457 case DRM_GPUVA_OP_MAP:
3458 if (!xe_vma_is_cpu_addr_mirror(op->map.vma))
3459 vma_add_ufence(op->map.vma, ufence);
3460 break;
3461 case DRM_GPUVA_OP_REMAP:
3462 if (op->remap.prev)
3463 vma_add_ufence(op->remap.prev, ufence);
3464 if (op->remap.next)
3465 vma_add_ufence(op->remap.next, ufence);
3466 break;
3467 case DRM_GPUVA_OP_UNMAP:
3468 break;
3469 case DRM_GPUVA_OP_PREFETCH:
3470 vma_add_ufence(gpuva_to_vma(op->base.prefetch.va), ufence);
3471 break;
3472 default:
3473 drm_warn(&vm->xe->drm, "NOT POSSIBLE\n");
3474 }
3475 }
3476
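/*
 * Post-execution fixups: attach the user fence (if any) to the affected VMAs,
 * schedule destruction of unmapped/remapped VMAs against @fence and signal
 * the sync entries. A NULL @fence means no page-table work was executed.
 */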
vm_bind_ioctl_ops_fini(struct xe_vm * vm,struct xe_vma_ops * vops,struct dma_fence * fence)3477 static void vm_bind_ioctl_ops_fini(struct xe_vm *vm, struct xe_vma_ops *vops,
3478 struct dma_fence *fence)
3479 {
3480 struct xe_user_fence *ufence;
3481 struct xe_vma_op *op;
3482 int i;
3483
3484 ufence = find_ufence_get(vops->syncs, vops->num_syncs);
3485 list_for_each_entry(op, &vops->list, link) {
3486 if (ufence)
3487 op_add_ufence(vm, op, ufence);
3488
3489 if (op->base.op == DRM_GPUVA_OP_UNMAP)
3490 xe_vma_destroy(gpuva_to_vma(op->base.unmap.va), fence);
3491 else if (op->base.op == DRM_GPUVA_OP_REMAP)
3492 xe_vma_destroy(gpuva_to_vma(op->base.remap.unmap->va),
3493 fence);
3494 }
3495 if (ufence)
3496 xe_sync_ufence_put(ufence);
3497 if (fence) {
3498 for (i = 0; i < vops->num_syncs; i++)
3499 xe_sync_entry_signal(vops->syncs + i, fence);
3500 }
3501 }
3502
vm_bind_ioctl_ops_execute(struct xe_vm * vm,struct xe_vma_ops * vops)3503 static struct dma_fence *vm_bind_ioctl_ops_execute(struct xe_vm *vm,
3504 struct xe_vma_ops *vops)
3505 {
3506 struct xe_validation_ctx ctx;
3507 struct drm_exec exec;
3508 struct dma_fence *fence;
3509 int err = 0;
3510
3511 lockdep_assert_held_write(&vm->lock);
3512
3513 xe_validation_guard(&ctx, &vm->xe->val, &exec,
3514 ((struct xe_val_flags) {
3515 .interruptible = true,
3516 .exec_ignore_duplicates = true,
3517 }), err) {
3518 err = vm_bind_ioctl_ops_lock_and_prep(&exec, vm, vops);
3519 drm_exec_retry_on_contention(&exec);
3520 xe_validation_retry_on_oom(&ctx, &err);
3521 if (err)
3522 return ERR_PTR(err);
3523
3524 xe_vm_set_validation_exec(vm, &exec);
3525 fence = ops_execute(vm, vops);
3526 xe_vm_set_validation_exec(vm, NULL);
3527 if (IS_ERR(fence)) {
3528 if (PTR_ERR(fence) == -ENODATA)
3529 vm_bind_ioctl_ops_fini(vm, vops, NULL);
3530 return fence;
3531 }
3532
3533 vm_bind_ioctl_ops_fini(vm, vops, fence);
3534 }
3535
3536 return err ? ERR_PTR(err) : fence;
3537 }
3538 ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_execute, ERRNO);
3539
3540 #define SUPPORTED_FLAGS_STUB \
3541 (DRM_XE_VM_BIND_FLAG_READONLY | \
3542 DRM_XE_VM_BIND_FLAG_IMMEDIATE | \
3543 DRM_XE_VM_BIND_FLAG_NULL | \
3544 DRM_XE_VM_BIND_FLAG_DUMPABLE | \
3545 DRM_XE_VM_BIND_FLAG_CHECK_PXP | \
3546 DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR | \
3547 DRM_XE_VM_BIND_FLAG_MADVISE_AUTORESET | \
3548 DRM_XE_VM_BIND_FLAG_DECOMPRESS)
3549
3550 #ifdef TEST_VM_OPS_ERROR
3551 #define SUPPORTED_FLAGS (SUPPORTED_FLAGS_STUB | FORCE_OP_ERROR)
3552 #else
3553 #define SUPPORTED_FLAGS SUPPORTED_FLAGS_STUB
3554 #endif
3555
3556 #define XE_64K_PAGE_MASK 0xffffull
3557 #define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
3558
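/*
 * Validate the bind arguments and, for arrays of binds, copy the bind ops
 * from userspace. On success *bind_ops points either at args->bind or at a
 * kvmalloc'ed array that the caller must kvfree() when args->num_binds > 1.
 */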
vm_bind_ioctl_check_args(struct xe_device * xe,struct xe_vm * vm,struct drm_xe_vm_bind * args,struct drm_xe_vm_bind_op ** bind_ops)3559 static int vm_bind_ioctl_check_args(struct xe_device *xe, struct xe_vm *vm,
3560 struct drm_xe_vm_bind *args,
3561 struct drm_xe_vm_bind_op **bind_ops)
3562 {
3563 int err;
3564 int i;
3565
3566 if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
3567 XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
3568 return -EINVAL;
3569
3570 if (XE_IOCTL_DBG(xe, args->extensions))
3571 return -EINVAL;
3572
3573 if (XE_IOCTL_DBG(xe, args->num_syncs > DRM_XE_MAX_SYNCS))
3574 return -EINVAL;
3575
3576 if (args->num_binds > 1) {
3577 u64 __user *bind_user =
3578 u64_to_user_ptr(args->vector_of_binds);
3579
3580 *bind_ops = kvmalloc_objs(struct drm_xe_vm_bind_op,
3581 args->num_binds,
3582 GFP_KERNEL | __GFP_ACCOUNT | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
3583 if (!*bind_ops)
3584 return args->num_binds > 1 ? -ENOBUFS : -ENOMEM;
3585
3586 err = copy_from_user(*bind_ops, bind_user,
3587 sizeof(struct drm_xe_vm_bind_op) *
3588 args->num_binds);
3589 if (XE_IOCTL_DBG(xe, err)) {
3590 err = -EFAULT;
3591 goto free_bind_ops;
3592 }
3593 } else {
3594 *bind_ops = &args->bind;
3595 }
3596
3597 for (i = 0; i < args->num_binds; ++i) {
3598 u64 range = (*bind_ops)[i].range;
3599 u64 addr = (*bind_ops)[i].addr;
3600 u32 op = (*bind_ops)[i].op;
3601 u32 flags = (*bind_ops)[i].flags;
3602 u32 obj = (*bind_ops)[i].obj;
3603 u64 obj_offset = (*bind_ops)[i].obj_offset;
3604 u32 prefetch_region = (*bind_ops)[i].prefetch_mem_region_instance;
3605 bool is_null = flags & DRM_XE_VM_BIND_FLAG_NULL;
3606 bool is_cpu_addr_mirror = flags &
3607 DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR;
3608 bool is_decompress = flags & DRM_XE_VM_BIND_FLAG_DECOMPRESS;
3609 u16 pat_index = (*bind_ops)[i].pat_index;
3610 u16 coh_mode;
3611 bool comp_en;
3612
3613 if (XE_IOCTL_DBG(xe, is_cpu_addr_mirror &&
3614 (!xe_vm_in_fault_mode(vm) ||
3615 !IS_ENABLED(CONFIG_DRM_XE_GPUSVM)))) {
3616 err = -EINVAL;
3617 goto free_bind_ops;
3618 }
3619
3620 if (XE_IOCTL_DBG(xe, pat_index >= xe->pat.n_entries)) {
3621 err = -EINVAL;
3622 goto free_bind_ops;
3623 }
3624
3625 pat_index = array_index_nospec(pat_index, xe->pat.n_entries);
3626 (*bind_ops)[i].pat_index = pat_index;
3627 coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
3628 comp_en = xe_pat_index_get_comp_en(xe, pat_index);
3629 if (XE_IOCTL_DBG(xe, !coh_mode)) { /* hw reserved */
3630 err = -EINVAL;
3631 goto free_bind_ops;
3632 }
3633
3634 if (XE_WARN_ON(coh_mode > XE_COH_2WAY)) {
3635 err = -EINVAL;
3636 goto free_bind_ops;
3637 }
3638
3639 if (XE_IOCTL_DBG(xe, op > DRM_XE_VM_BIND_OP_PREFETCH) ||
3640 XE_IOCTL_DBG(xe, flags & ~SUPPORTED_FLAGS) ||
3641 XE_IOCTL_DBG(xe, obj && (is_null || is_cpu_addr_mirror)) ||
3642 XE_IOCTL_DBG(xe, obj_offset && (is_null ||
3643 is_cpu_addr_mirror)) ||
3644 XE_IOCTL_DBG(xe, op != DRM_XE_VM_BIND_OP_MAP &&
3645 (is_decompress || is_null || is_cpu_addr_mirror)) ||
3646 XE_IOCTL_DBG(xe, is_decompress &&
3647 xe_pat_index_get_comp_en(xe, pat_index)) ||
3648 XE_IOCTL_DBG(xe, !obj &&
3649 op == DRM_XE_VM_BIND_OP_MAP &&
3650 !is_null && !is_cpu_addr_mirror) ||
3651 XE_IOCTL_DBG(xe, !obj &&
3652 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
3653 XE_IOCTL_DBG(xe, addr &&
3654 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
3655 XE_IOCTL_DBG(xe, range &&
3656 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
3657 XE_IOCTL_DBG(xe, obj &&
3658 op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
3659 XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE &&
3660 op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
3661 XE_IOCTL_DBG(xe, xe_device_is_l2_flush_optimized(xe) &&
3662 (op == DRM_XE_VM_BIND_OP_MAP_USERPTR ||
3663 is_cpu_addr_mirror) &&
3664 (pat_index != 19 && coh_mode != XE_COH_2WAY)) ||
3665 XE_IOCTL_DBG(xe, comp_en &&
3666 op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
3667 XE_IOCTL_DBG(xe, op == DRM_XE_VM_BIND_OP_MAP_USERPTR &&
3668 !IS_ENABLED(CONFIG_DRM_GPUSVM)) ||
3669 XE_IOCTL_DBG(xe, obj &&
3670 op == DRM_XE_VM_BIND_OP_PREFETCH) ||
3671 XE_IOCTL_DBG(xe, prefetch_region &&
3672 op != DRM_XE_VM_BIND_OP_PREFETCH) ||
3673 XE_IOCTL_DBG(xe, (prefetch_region != DRM_XE_CONSULT_MEM_ADVISE_PREF_LOC &&
3674 /* Guard against undefined shift in BIT(prefetch_region) */
3675 (prefetch_region >= (sizeof(xe->info.mem_region_mask) * 8) ||
3676 !(BIT(prefetch_region) & xe->info.mem_region_mask)))) ||
3677 XE_IOCTL_DBG(xe, obj &&
3678 op == DRM_XE_VM_BIND_OP_UNMAP) ||
3679 XE_IOCTL_DBG(xe, (flags & DRM_XE_VM_BIND_FLAG_MADVISE_AUTORESET) &&
3680 (!is_cpu_addr_mirror || op != DRM_XE_VM_BIND_OP_MAP))) {
3681 err = -EINVAL;
3682 goto free_bind_ops;
3683 }
3684
3685 if (XE_IOCTL_DBG(xe, obj_offset & ~PAGE_MASK) ||
3686 XE_IOCTL_DBG(xe, addr & ~PAGE_MASK) ||
3687 XE_IOCTL_DBG(xe, range & ~PAGE_MASK) ||
3688 XE_IOCTL_DBG(xe, !range &&
3689 op != DRM_XE_VM_BIND_OP_UNMAP_ALL)) {
3690 err = -EINVAL;
3691 goto free_bind_ops;
3692 }
3693
3694 if (is_decompress && (XE_IOCTL_DBG(xe, !xe_device_has_flat_ccs(xe)) ||
3695 XE_IOCTL_DBG(xe, GRAPHICS_VER(xe) < 20) ||
3696 XE_IOCTL_DBG(xe, !IS_DGFX(xe)))) {
3697 err = -EOPNOTSUPP;
3698 goto free_bind_ops;
3699 }
3700 }
3701
3702 return 0;
3703
3704 free_bind_ops:
3705 if (args->num_binds > 1)
3706 kvfree(*bind_ops);
3707 *bind_ops = NULL;
3708 return err;
3709 }
3710
vm_bind_ioctl_signal_fences(struct xe_vm * vm,struct xe_exec_queue * q,struct xe_sync_entry * syncs,int num_syncs)3711 static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
3712 struct xe_exec_queue *q,
3713 struct xe_sync_entry *syncs,
3714 int num_syncs)
3715 {
3716 struct dma_fence *fence = NULL;
3717 int i, err = 0;
3718
3719 if (num_syncs) {
3720 fence = xe_sync_in_fence_get(syncs, num_syncs,
3721 to_wait_exec_queue(vm, q), vm);
3722 if (IS_ERR(fence))
3723 return PTR_ERR(fence);
3724
3725 for (i = 0; i < num_syncs; i++)
3726 xe_sync_entry_signal(&syncs[i], fence);
3727 }
3728
3729 dma_fence_put(fence);
3730
3731 return err;
3732 }
3733
xe_vma_ops_init(struct xe_vma_ops * vops,struct xe_vm * vm,struct xe_exec_queue * q,struct xe_sync_entry * syncs,u32 num_syncs)3734 static void xe_vma_ops_init(struct xe_vma_ops *vops, struct xe_vm *vm,
3735 struct xe_exec_queue *q,
3736 struct xe_sync_entry *syncs, u32 num_syncs)
3737 {
3738 memset(vops, 0, sizeof(*vops));
3739 INIT_LIST_HEAD(&vops->list);
3740 vops->vm = vm;
3741 vops->q = q;
3742 vops->syncs = syncs;
3743 vops->num_syncs = num_syncs;
3744 vops->flags = 0;
3745 }
3746
xe_vm_bind_ioctl_validate_bo(struct xe_device * xe,struct xe_bo * bo,u64 addr,u64 range,u64 obj_offset,u16 pat_index,u32 op,u32 bind_flags)3747 static int xe_vm_bind_ioctl_validate_bo(struct xe_device *xe, struct xe_bo *bo,
3748 u64 addr, u64 range, u64 obj_offset,
3749 u16 pat_index, u32 op, u32 bind_flags)
3750 {
3751 u16 coh_mode;
3752 bool comp_en;
3753
3754 if (XE_IOCTL_DBG(xe, (bo->flags & XE_BO_FLAG_NO_COMPRESSION) &&
3755 xe_pat_index_get_comp_en(xe, pat_index)))
3756 return -EINVAL;
3757
3758 if (XE_IOCTL_DBG(xe, range > xe_bo_size(bo)) ||
3759 XE_IOCTL_DBG(xe, obj_offset >
3760 xe_bo_size(bo) - range)) {
3761 return -EINVAL;
3762 }
3763
3764 /*
3765 * Some platforms require 64k VM_BIND alignment,
3766 * specifically those with XE_VRAM_FLAGS_NEED64K.
3767 *
3768 	 * Other platforms may have BOs set to 64k physical placement,
3769 * but can be mapped at 4k offsets anyway. This check is only
3770 * there for the former case.
3771 */
3772 if ((bo->flags & XE_BO_FLAG_INTERNAL_64K) &&
3773 (xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)) {
3774 if (XE_IOCTL_DBG(xe, obj_offset &
3775 XE_64K_PAGE_MASK) ||
3776 XE_IOCTL_DBG(xe, addr & XE_64K_PAGE_MASK) ||
3777 XE_IOCTL_DBG(xe, range & XE_64K_PAGE_MASK)) {
3778 return -EINVAL;
3779 }
3780 }
3781
3782 coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
3783 if (bo->cpu_caching) {
3784 if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE &&
3785 bo->cpu_caching == DRM_XE_GEM_CPU_CACHING_WB)) {
3786 return -EINVAL;
3787 }
3788 } else if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE)) {
3789 		/*
3790 		 * An imported dma-buf from a different device should
3791 		 * require 1-way or 2-way coherency since we don't know
3792 		 * how it was mapped on the CPU. Just assume it is
3793 		 * potentially cached on the CPU side.
3794 		 */
3795 return -EINVAL;
3796 }
3797
3798 /*
3799 * Ensures that imported buffer objects (dma-bufs) are not mapped
3800 * with a PAT index that enables compression.
3801 */
3802 comp_en = xe_pat_index_get_comp_en(xe, pat_index);
3803 if (XE_IOCTL_DBG(xe, bo->ttm.base.import_attach && comp_en))
3804 return -EINVAL;
3805
3806 if (XE_IOCTL_DBG(xe, bo->ttm.base.import_attach && xe_device_is_l2_flush_optimized(xe) &&
3807 (pat_index != 19 && coh_mode != XE_COH_2WAY)))
3808 return -EINVAL;
3809
3810 /* If a BO is protected it can only be mapped if the key is still valid */
3811 if ((bind_flags & DRM_XE_VM_BIND_FLAG_CHECK_PXP) && xe_bo_is_protected(bo) &&
3812 op != DRM_XE_VM_BIND_OP_UNMAP && op != DRM_XE_VM_BIND_OP_UNMAP_ALL)
3813 if (XE_IOCTL_DBG(xe, xe_pxp_bo_key_check(xe->pxp, bo) != 0))
3814 return -ENOEXEC;
3815
3816 return 0;
3817 }
3818
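/*
 * Illustrative single-bind userspace sketch (not authoritative, error
 * handling omitted; a platform-valid pat_index must be chosen by the caller):
 *
 *	struct drm_xe_vm_bind bind = {
 *		.vm_id = vm_id,
 *		.num_binds = 1,
 *		.bind = {
 *			.obj = bo_handle,
 *			.addr = gpu_addr,
 *			.range = size,
 *			.op = DRM_XE_VM_BIND_OP_MAP,
 *			.pat_index = pat_index,
 *		},
 *	};
 *
 *	ioctl(fd, DRM_IOCTL_XE_VM_BIND, &bind);
 */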
xe_vm_bind_ioctl(struct drm_device * dev,void * data,struct drm_file * file)3819 int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3820 {
3821 struct xe_device *xe = to_xe_device(dev);
3822 struct xe_file *xef = to_xe_file(file);
3823 struct drm_xe_vm_bind *args = data;
3824 struct drm_xe_sync __user *syncs_user;
3825 struct xe_bo **bos = NULL;
3826 struct drm_gpuva_ops **ops = NULL;
3827 struct xe_vm *vm;
3828 struct xe_exec_queue *q = NULL;
3829 u32 num_syncs, num_ufence = 0;
3830 struct xe_sync_entry *syncs = NULL;
3831 struct drm_xe_vm_bind_op *bind_ops = NULL;
3832 struct xe_vma_ops vops;
3833 struct dma_fence *fence;
3834 int err;
3835 int i;
3836
3837 vm = xe_vm_lookup(xef, args->vm_id);
3838 if (XE_IOCTL_DBG(xe, !vm))
3839 return -EINVAL;
3840
3841 err = vm_bind_ioctl_check_args(xe, vm, args, &bind_ops);
3842 if (err)
3843 goto put_vm;
3844
3845 if (args->exec_queue_id) {
3846 q = xe_exec_queue_lookup(xef, args->exec_queue_id);
3847 if (XE_IOCTL_DBG(xe, !q)) {
3848 err = -ENOENT;
3849 goto free_bind_ops;
3850 }
3851
3852 if (XE_IOCTL_DBG(xe, !(q->flags & EXEC_QUEUE_FLAG_VM))) {
3853 err = -EINVAL;
3854 goto put_exec_queue;
3855 }
3856 }
3857
3858 if (XE_IOCTL_DBG(xe, q && vm != q->user_vm)) {
3859 err = -EINVAL;
3860 goto put_exec_queue;
3861 }
3862
3863 /* Ensure all UNMAPs visible */
3864 xe_svm_flush(vm);
3865
3866 err = down_write_killable(&vm->lock);
3867 if (err)
3868 goto put_exec_queue;
3869
3870 if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) {
3871 err = -ENOENT;
3872 goto release_vm_lock;
3873 }
3874
3875 for (i = 0; i < args->num_binds; ++i) {
3876 u64 range = bind_ops[i].range;
3877 u64 addr = bind_ops[i].addr;
3878
3879 if (XE_IOCTL_DBG(xe, range > vm->size) ||
3880 XE_IOCTL_DBG(xe, addr > vm->size - range)) {
3881 err = -EINVAL;
3882 goto release_vm_lock;
3883 }
3884 }
3885
3886 if (args->num_binds) {
3887 bos = kvzalloc_objs(*bos, args->num_binds,
3888 GFP_KERNEL | __GFP_ACCOUNT | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
3889 if (!bos) {
3890 err = -ENOMEM;
3891 goto release_vm_lock;
3892 }
3893
3894 ops = kvzalloc_objs(*ops, args->num_binds,
3895 GFP_KERNEL | __GFP_ACCOUNT | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
3896 if (!ops) {
3897 err = -ENOMEM;
3898 goto free_bos;
3899 }
3900 }
3901
3902 for (i = 0; i < args->num_binds; ++i) {
3903 struct drm_gem_object *gem_obj;
3904 u64 range = bind_ops[i].range;
3905 u64 addr = bind_ops[i].addr;
3906 u32 obj = bind_ops[i].obj;
3907 u64 obj_offset = bind_ops[i].obj_offset;
3908 u16 pat_index = bind_ops[i].pat_index;
3909 u32 op = bind_ops[i].op;
3910 u32 bind_flags = bind_ops[i].flags;
3911
3912 if (!obj)
3913 continue;
3914
3915 gem_obj = drm_gem_object_lookup(file, obj);
3916 if (XE_IOCTL_DBG(xe, !gem_obj)) {
3917 err = -ENOENT;
3918 goto put_obj;
3919 }
3920 bos[i] = gem_to_xe_bo(gem_obj);
3921
3922 err = xe_vm_bind_ioctl_validate_bo(xe, bos[i], addr, range,
3923 obj_offset, pat_index, op,
3924 bind_flags);
3925 if (err)
3926 goto put_obj;
3927 }
3928
3929 if (args->num_syncs) {
3930 syncs = kzalloc_objs(*syncs, args->num_syncs);
3931 if (!syncs) {
3932 err = -ENOMEM;
3933 goto put_obj;
3934 }
3935 }
3936
3937 syncs_user = u64_to_user_ptr(args->syncs);
3938 for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
3939 struct xe_exec_queue *__q = q ?: vm->q[0];
3940
3941 err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
3942 &syncs_user[num_syncs],
3943 __q->ufence_syncobj,
3944 ++__q->ufence_timeline_value,
3945 (xe_vm_in_lr_mode(vm) ?
3946 SYNC_PARSE_FLAG_LR_MODE : 0) |
3947 (!args->num_binds ?
3948 SYNC_PARSE_FLAG_DISALLOW_USER_FENCE : 0));
3949 if (err)
3950 goto free_syncs;
3951
3952 if (xe_sync_is_ufence(&syncs[num_syncs]))
3953 num_ufence++;
3954 }
3955
3956 if (XE_IOCTL_DBG(xe, num_ufence > 1)) {
3957 err = -EINVAL;
3958 goto free_syncs;
3959 }
3960
3961 if (!args->num_binds) {
3962 err = -ENODATA;
3963 goto free_syncs;
3964 }
3965
3966 xe_vma_ops_init(&vops, vm, q, syncs, num_syncs);
3967 if (args->num_binds > 1)
3968 vops.flags |= XE_VMA_OPS_ARRAY_OF_BINDS;
3969 for (i = 0; i < args->num_binds; ++i) {
3970 u64 range = bind_ops[i].range;
3971 u64 addr = bind_ops[i].addr;
3972 u32 op = bind_ops[i].op;
3973 u32 flags = bind_ops[i].flags;
3974 u64 obj_offset = bind_ops[i].obj_offset;
3975 u32 prefetch_region = bind_ops[i].prefetch_mem_region_instance;
3976 u16 pat_index = bind_ops[i].pat_index;
3977
3978 ops[i] = vm_bind_ioctl_ops_create(vm, &vops, bos[i], obj_offset,
3979 addr, range, op, flags,
3980 prefetch_region, pat_index);
3981 if (IS_ERR(ops[i])) {
3982 err = PTR_ERR(ops[i]);
3983 ops[i] = NULL;
3984 goto unwind_ops;
3985 }
3986
3987 err = vm_bind_ioctl_ops_parse(vm, ops[i], &vops);
3988 if (err)
3989 goto unwind_ops;
3990
3991 #ifdef TEST_VM_OPS_ERROR
3992 if (flags & FORCE_OP_ERROR) {
3993 vops.inject_error = true;
3994 vm->xe->vm_inject_error_position =
3995 (vm->xe->vm_inject_error_position + 1) %
3996 FORCE_OP_ERROR_COUNT;
3997 }
3998 #endif
3999 }
4000
4001 /* Nothing to do */
4002 if (list_empty(&vops.list)) {
4003 err = -ENODATA;
4004 goto unwind_ops;
4005 }
4006
4007 err = xe_vma_ops_alloc(&vops, args->num_binds > 1);
4008 if (err)
4009 goto unwind_ops;
4010
4011 err = vm_bind_ioctl_ops_prefetch_ranges(vm, &vops);
4012 if (err)
4013 goto unwind_ops;
4014
4015 fence = vm_bind_ioctl_ops_execute(vm, &vops);
4016 if (IS_ERR(fence))
4017 err = PTR_ERR(fence);
4018 else
4019 dma_fence_put(fence);
4020
4021 unwind_ops:
4022 if (err && err != -ENODATA)
4023 vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
4024 xe_vma_ops_fini(&vops);
4025 for (i = args->num_binds - 1; i >= 0; --i)
4026 if (ops[i])
4027 drm_gpuva_ops_free(&vm->gpuvm, ops[i]);
4028 free_syncs:
4029 if (err == -ENODATA)
4030 err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs);
4031 while (num_syncs--)
4032 xe_sync_entry_cleanup(&syncs[num_syncs]);
4033
4034 kfree(syncs);
4035 put_obj:
4036 for (i = 0; i < args->num_binds; ++i)
4037 xe_bo_put(bos[i]);
4038
4039 kvfree(ops);
4040 free_bos:
4041 kvfree(bos);
4042 release_vm_lock:
4043 up_write(&vm->lock);
4044 put_exec_queue:
4045 if (q)
4046 xe_exec_queue_put(q);
4047 free_bind_ops:
4048 if (args->num_binds > 1)
4049 kvfree(bind_ops);
4050 put_vm:
4051 xe_vm_put(vm);
4052 return err;
4053 }
4054
4055 /*
4056 * Map access type, fault type, and fault level from the current bspec
4057 * specification to the uAPI abstraction. The mapping is currently
4058 * close to 1-to-1; access type is the only notable exception, as it
4059 * carries additional prefetch-status data that needs to be masked
4060 * out.
4061 */
xe_to_user_access_type(u8 access_type)4062 static u8 xe_to_user_access_type(u8 access_type)
4063 {
4064 return access_type & XE_PAGEFAULT_ACCESS_TYPE_MASK;
4065 }
4066
xe_to_user_fault_type(u8 fault_type)4067 static u8 xe_to_user_fault_type(u8 fault_type)
4068 {
4069 return fault_type;
4070 }
4071
xe_to_user_fault_level(u8 fault_level)4072 static u8 xe_to_user_fault_level(u8 fault_level)
4073 {
4074 return fault_level;
4075 }
4076
fill_faults(struct xe_vm * vm,struct drm_xe_vm_get_property * args)4077 static int fill_faults(struct xe_vm *vm,
4078 struct drm_xe_vm_get_property *args)
4079 {
4080 struct xe_vm_fault __user *usr_ptr = u64_to_user_ptr(args->data);
4081 struct xe_vm_fault *fault_list, fault_entry = { 0 };
4082 struct xe_vm_fault_entry *entry;
4083 int ret = 0, i = 0, count, entry_size;
4084
4085 entry_size = sizeof(struct xe_vm_fault);
4086 count = args->size / entry_size;
4087
4088 fault_list = kcalloc(count, sizeof(struct xe_vm_fault), GFP_KERNEL);
4089 if (!fault_list)
4090 return -ENOMEM;
4091
4092 spin_lock(&vm->faults.lock);
4093 list_for_each_entry(entry, &vm->faults.list, list) {
4094 if (i == count)
4095 break;
4096
4097 fault_entry.address = xe_device_canonicalize_addr(vm->xe, entry->address);
4098 fault_entry.address_precision = entry->address_precision;
4099
4100 fault_entry.access_type = xe_to_user_access_type(entry->access_type);
4101 fault_entry.fault_type = xe_to_user_fault_type(entry->fault_type);
4102 fault_entry.fault_level = xe_to_user_fault_level(entry->fault_level);
4103
4104 memcpy(&fault_list[i], &fault_entry, entry_size);
4105
4106 i++;
4107 }
4108 spin_unlock(&vm->faults.lock);
4109
4110 ret = copy_to_user(usr_ptr, fault_list, args->size);
4111
4112 kfree(fault_list);
4113 return ret ? -EFAULT : 0;
4114 }
4115
xe_vm_get_property_helper(struct xe_vm * vm,struct drm_xe_vm_get_property * args)4116 static int xe_vm_get_property_helper(struct xe_vm *vm,
4117 struct drm_xe_vm_get_property *args)
4118 {
4119 size_t size;
4120
4121 switch (args->property) {
4122 case DRM_XE_VM_GET_PROPERTY_FAULTS:
4123 spin_lock(&vm->faults.lock);
4124 size = size_mul(sizeof(struct xe_vm_fault), vm->faults.len);
4125 spin_unlock(&vm->faults.lock);
4126
4127 if (!args->size) {
4128 args->size = size;
4129 return 0;
4130 }
4131
4132 /*
4133 * The number of faults may increase between calls to
4134 * xe_vm_get_property_ioctl(), so just report the number of
4135 * faults the user requested, provided it does not exceed the
4136 * number of faults currently in the VM fault array.
4137 *
4138 * Also require that args->size is a multiple of the
4139 * xe_vm_fault struct size.
4140 */
4141 if (args->size > size || args->size % sizeof(struct xe_vm_fault))
4142 return -EINVAL;
4143
4144 return fill_faults(vm, args);
4145 }
4146 return -EINVAL;
4147 }
4148
xe_vm_get_property_ioctl(struct drm_device * drm,void * data,struct drm_file * file)4149 int xe_vm_get_property_ioctl(struct drm_device *drm, void *data,
4150 struct drm_file *file)
4151 {
4152 struct xe_device *xe = to_xe_device(drm);
4153 struct xe_file *xef = to_xe_file(file);
4154 struct drm_xe_vm_get_property *args = data;
4155 struct xe_vm *vm;
4156 int ret = 0;
4157
4158 if (XE_IOCTL_DBG(xe, (args->reserved[0] || args->reserved[1] ||
4159 args->reserved[2])))
4160 return -EINVAL;
4161
4162 vm = xe_vm_lookup(xef, args->vm_id);
4163 if (XE_IOCTL_DBG(xe, !vm))
4164 return -ENOENT;
4165
4166 ret = xe_vm_get_property_helper(vm, args);
4167
4168 xe_vm_put(vm);
4169 return ret;
4170 }
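/*
 * Illustrative two-call usage from userspace (sketch only; the exact ioctl
 * macro name is assumed here and not defined in this file):
 *
 *	struct drm_xe_vm_get_property args = {
 *		.vm_id = vm_id,
 *		.property = DRM_XE_VM_GET_PROPERTY_FAULTS,
 *		.size = 0,
 *	};
 *
 *	ioctl(fd, DRM_IOCTL_XE_VM_GET_PROPERTY, &args);	 // query required size
 *	args.data = (uintptr_t)malloc(args.size);
 *	ioctl(fd, DRM_IOCTL_XE_VM_GET_PROPERTY, &args);	 // fill fault entries
 */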
4171
4172 /**
4173 * xe_vm_bind_kernel_bo - bind a kernel BO to a VM
4174 * @vm: VM to bind the BO to
4175 * @bo: BO to bind
4176 * @q: exec queue to use for the bind (optional)
4177 * @addr: address at which to bind the BO
4178 * @cache_lvl: PAT cache level to use
4179 *
4180 * Execute a VM bind map operation on a kernel-owned BO to bind it into a
4181 * kernel-owned VM.
4182 *
4183 * Returns a dma_fence to track the binding completion if the job to do so was
4184 * successfully submitted, an error pointer otherwise.
4185 */
xe_vm_bind_kernel_bo(struct xe_vm * vm,struct xe_bo * bo,struct xe_exec_queue * q,u64 addr,enum xe_cache_level cache_lvl)4186 struct dma_fence *xe_vm_bind_kernel_bo(struct xe_vm *vm, struct xe_bo *bo,
4187 struct xe_exec_queue *q, u64 addr,
4188 enum xe_cache_level cache_lvl)
4189 {
4190 struct xe_vma_ops vops;
4191 struct drm_gpuva_ops *ops = NULL;
4192 struct dma_fence *fence;
4193 int err;
4194
4195 xe_bo_get(bo);
4196 xe_vm_get(vm);
4197 if (q)
4198 xe_exec_queue_get(q);
4199
4200 down_write(&vm->lock);
4201
4202 xe_vma_ops_init(&vops, vm, q, NULL, 0);
4203
4204 ops = vm_bind_ioctl_ops_create(vm, &vops, bo, 0, addr, xe_bo_size(bo),
4205 DRM_XE_VM_BIND_OP_MAP, 0, 0,
4206 vm->xe->pat.idx[cache_lvl]);
4207 if (IS_ERR(ops)) {
4208 err = PTR_ERR(ops);
4209 goto release_vm_lock;
4210 }
4211
4212 err = vm_bind_ioctl_ops_parse(vm, ops, &vops);
4213 if (err)
4214 goto release_vm_lock;
4215
4216 xe_assert(vm->xe, !list_empty(&vops.list));
4217
4218 err = xe_vma_ops_alloc(&vops, false);
4219 if (err)
4220 goto unwind_ops;
4221
4222 fence = vm_bind_ioctl_ops_execute(vm, &vops);
4223 if (IS_ERR(fence))
4224 err = PTR_ERR(fence);
4225
4226 unwind_ops:
4227 if (err && err != -ENODATA)
4228 vm_bind_ioctl_ops_unwind(vm, &ops, 1);
4229
4230 xe_vma_ops_fini(&vops);
4231 drm_gpuva_ops_free(&vm->gpuvm, ops);
4232
4233 release_vm_lock:
4234 up_write(&vm->lock);
4235
4236 if (q)
4237 xe_exec_queue_put(q);
4238 xe_vm_put(vm);
4239 xe_bo_put(bo);
4240
4241 if (err)
4242 fence = ERR_PTR(err);
4243
4244 return fence;
4245 }
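/*
 * Minimal usage sketch for a kernel-internal binding (illustrative only; the
 * chosen address and cache level are placeholders):
 *
 *	struct dma_fence *fence;
 *
 *	fence = xe_vm_bind_kernel_bo(vm, bo, NULL, addr, XE_CACHE_WB);
 *	if (IS_ERR(fence))
 *		return PTR_ERR(fence);
 *	dma_fence_wait(fence, false);
 *	dma_fence_put(fence);
 */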
4246
4247 /**
4248 * xe_vm_lock() - Lock the vm's dma_resv object
4249 * @vm: The struct xe_vm whose lock is to be locked
4250 * @intr: Whether to perform any waits interruptibly
4251 *
4252 * Return: 0 on success, -EINTR if @intr is true and the wait for a
4253 * contended lock was interrupted. If @intr is false, the function
4254 * always returns 0.
4255 */
xe_vm_lock(struct xe_vm * vm,bool intr)4256 int xe_vm_lock(struct xe_vm *vm, bool intr)
4257 {
4258 int ret;
4259
4260 if (intr)
4261 ret = dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
4262 else
4263 ret = dma_resv_lock(xe_vm_resv(vm), NULL);
4264
4265 return ret;
4266 }
4267
4268 /**
4269 * xe_vm_unlock() - Unlock the vm's dma_resv object
4270 * @vm: The struct xe_vm whose lock is to be released.
4271 *
4272 * Unlock the vm's dma_resv object that was locked by xe_vm_lock().
4273 */
xe_vm_unlock(struct xe_vm * vm)4274 void xe_vm_unlock(struct xe_vm *vm)
4275 {
4276 dma_resv_unlock(xe_vm_resv(vm));
4277 }
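/*
 * Typical interruptible locking pattern (sketch):
 *
 *	err = xe_vm_lock(vm, true);
 *	if (err)
 *		return err;	// -EINTR if the wait was interrupted
 *	...
 *	xe_vm_unlock(vm);
 */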
4278
4279 /**
4280 * xe_vm_invalidate_vma_submit - Submit a job to invalidate GPU mappings for
4281 * VMA.
4282 * @vma: VMA to invalidate
4283 * @batch: TLB invalidation batch to populate; caller must later call
4284 * xe_tlb_inval_batch_wait() on it to wait for completion
4285 *
4286 * Walks the page-table leaves and zeroes (memsets) the entries owned by this
4287 * VMA, then issues TLB invalidations. It does not block waiting for the TLB
4288 * flush to complete; instead it populates @batch, which can be waited on
4289 * using xe_tlb_inval_batch_wait().
4290 *
4291 * Returns 0 for success, negative error code otherwise.
4292 */
xe_vm_invalidate_vma_submit(struct xe_vma * vma,struct xe_tlb_inval_batch * batch)4293 int xe_vm_invalidate_vma_submit(struct xe_vma *vma, struct xe_tlb_inval_batch *batch)
4294 {
4295 struct xe_device *xe = xe_vma_vm(vma)->xe;
4296 struct xe_vm *vm = xe_vma_vm(vma);
4297 struct xe_tile *tile;
4298 u8 tile_mask = 0;
4299 int ret = 0;
4300 u8 id;
4301
4302 xe_assert(xe, !xe_vma_is_null(vma));
4303 xe_assert(xe, !xe_vma_is_cpu_addr_mirror(vma));
4304 trace_xe_vma_invalidate(vma);
4305
4306 vm_dbg(&vm->xe->drm,
4307 "INVALIDATE: addr=0x%016llx, range=0x%016llx",
4308 xe_vma_start(vma), xe_vma_size(vma));
4309
4310 /*
4311 * Check that we don't race with page-table updates; the tile_invalidated
4312 * update itself is safe.
4313 */
4314 if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
4315 if (xe_vma_is_userptr(vma)) {
4316 lockdep_assert(lockdep_is_held_type(&vm->svm.gpusvm.notifier_lock, 0) ||
4317 (lockdep_is_held_type(&vm->svm.gpusvm.notifier_lock, 1) &&
4318 lockdep_is_held(&xe_vm_resv(vm)->lock.base)));
4319
4320 WARN_ON_ONCE(!mmu_interval_check_retry
4321 (&to_userptr_vma(vma)->userptr.notifier,
4322 to_userptr_vma(vma)->userptr.pages.notifier_seq));
4323 WARN_ON_ONCE(!dma_resv_test_signaled(xe_vm_resv(vm),
4324 DMA_RESV_USAGE_BOOKKEEP));
4325
4326 } else {
4327 xe_bo_assert_held(xe_vma_bo(vma));
4328 }
4329 }
4330
4331 for_each_tile(tile, xe, id)
4332 if (xe_pt_zap_ptes(tile, vma))
4333 tile_mask |= BIT(id);
4334
4335 xe_device_wmb(xe);
4336
4337 ret = xe_tlb_inval_range_tilemask_submit(xe, xe_vma_vm(vma)->usm.asid,
4338 xe_vma_start(vma), xe_vma_end(vma),
4339 tile_mask, batch);
4340
4341 /* WRITE_ONCE pairs with READ_ONCE in xe_vm_has_valid_gpu_mapping() */
4342 WRITE_ONCE(vma->tile_invalidated, vma->tile_mask);
4343 return ret;
4344 }
4345
4346 /**
4347 * xe_vm_invalidate_vma - invalidate GPU mappings for VMA without a lock
4348 * @vma: VMA to invalidate
4349 *
4350 * Walks the page-table leaves and zeroes (memsets) the entries owned by this
4351 * VMA, invalidates the TLBs, and blocks until the TLB invalidation is
4352 * complete.
4353 *
4354 * Returns 0 for success, negative error code otherwise.
4355 */
xe_vm_invalidate_vma(struct xe_vma * vma)4356 int xe_vm_invalidate_vma(struct xe_vma *vma)
4357 {
4358 struct xe_tlb_inval_batch batch;
4359 int ret;
4360
4361 ret = xe_vm_invalidate_vma_submit(vma, &batch);
4362 if (ret)
4363 return ret;
4364
4365 xe_tlb_inval_batch_wait(&batch);
4366 return ret;
4367 }
4368
xe_vm_validate_protected(struct xe_vm * vm)4369 int xe_vm_validate_protected(struct xe_vm *vm)
4370 {
4371 struct drm_gpuva *gpuva;
4372 int err = 0;
4373
4374 if (!vm)
4375 return -ENODEV;
4376
4377 mutex_lock(&vm->snap_mutex);
4378
4379 drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
4380 struct xe_vma *vma = gpuva_to_vma(gpuva);
4381 struct xe_bo *bo = vma->gpuva.gem.obj ?
4382 gem_to_xe_bo(vma->gpuva.gem.obj) : NULL;
4383
4384 if (!bo)
4385 continue;
4386
4387 if (xe_bo_is_protected(bo)) {
4388 err = xe_pxp_bo_key_check(vm->xe->pxp, bo);
4389 if (err)
4390 break;
4391 }
4392 }
4393
4394 mutex_unlock(&vm->snap_mutex);
4395 return err;
4396 }
4397
4398 struct xe_vm_snapshot {
4399 int uapi_flags;
4400 unsigned long num_snaps;
4401 struct {
4402 u64 ofs, bo_ofs;
4403 unsigned long len;
4404 #define XE_VM_SNAP_FLAG_USERPTR BIT(0)
4405 #define XE_VM_SNAP_FLAG_READ_ONLY BIT(1)
4406 #define XE_VM_SNAP_FLAG_IS_NULL BIT(2)
4407 unsigned long flags;
4408 int uapi_mem_region;
4409 int pat_index;
4410 int cpu_caching;
4411 struct xe_bo *bo;
4412 void *data;
4413 struct mm_struct *mm;
4414 } snap[];
4415 };
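/*
 * Snapshot lifecycle (as implemented below): xe_vm_snapshot_capture() records
 * VMA metadata under vm->snap_mutex using GFP_NOWAIT allocations so it is
 * usable from the capture path; xe_vm_snapshot_capture_delayed() later copies
 * the actual BO/userptr contents from process context; xe_vm_snapshot_print()
 * emits the data ascii85-encoded into a drm_printer; xe_vm_snapshot_free()
 * drops the data, BO references and mm references.
 */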
4416
xe_vm_snapshot_capture(struct xe_vm * vm)4417 struct xe_vm_snapshot *xe_vm_snapshot_capture(struct xe_vm *vm)
4418 {
4419 unsigned long num_snaps = 0, i;
4420 struct xe_vm_snapshot *snap = NULL;
4421 struct drm_gpuva *gpuva;
4422
4423 if (!vm)
4424 return NULL;
4425
4426 mutex_lock(&vm->snap_mutex);
4427 drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
4428 if (gpuva->flags & XE_VMA_DUMPABLE)
4429 num_snaps++;
4430 }
4431
4432 if (num_snaps)
4433 snap = kvzalloc(offsetof(struct xe_vm_snapshot, snap[num_snaps]), GFP_NOWAIT);
4434 if (!snap) {
4435 snap = num_snaps ? ERR_PTR(-ENOMEM) : ERR_PTR(-ENODEV);
4436 goto out_unlock;
4437 }
4438
4439 if (vm->flags & XE_VM_FLAG_FAULT_MODE)
4440 snap->uapi_flags |= DRM_XE_VM_CREATE_FLAG_FAULT_MODE;
4441 if (vm->flags & XE_VM_FLAG_LR_MODE)
4442 snap->uapi_flags |= DRM_XE_VM_CREATE_FLAG_LR_MODE;
4443 if (vm->flags & XE_VM_FLAG_SCRATCH_PAGE)
4444 snap->uapi_flags |= DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE;
4445
4446 snap->num_snaps = num_snaps;
4447 i = 0;
4448 drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
4449 struct xe_vma *vma = gpuva_to_vma(gpuva);
4450 struct xe_bo *bo = vma->gpuva.gem.obj ?
4451 gem_to_xe_bo(vma->gpuva.gem.obj) : NULL;
4452
4453 if (!(gpuva->flags & XE_VMA_DUMPABLE))
4454 continue;
4455
4456 snap->snap[i].ofs = xe_vma_start(vma);
4457 snap->snap[i].len = xe_vma_size(vma);
4458 snap->snap[i].flags = xe_vma_read_only(vma) ?
4459 XE_VM_SNAP_FLAG_READ_ONLY : 0;
4460 snap->snap[i].pat_index = vma->attr.pat_index;
4461 if (bo) {
4462 snap->snap[i].cpu_caching = bo->cpu_caching;
4463 snap->snap[i].bo = xe_bo_get(bo);
4464 snap->snap[i].bo_ofs = xe_vma_bo_offset(vma);
4465 switch (bo->ttm.resource->mem_type) {
4466 case XE_PL_SYSTEM:
4467 case XE_PL_TT:
4468 snap->snap[i].uapi_mem_region = 0;
4469 break;
4470 case XE_PL_VRAM0:
4471 snap->snap[i].uapi_mem_region = 1;
4472 break;
4473 case XE_PL_VRAM1:
4474 snap->snap[i].uapi_mem_region = 2;
4475 break;
4476 }
4477 } else if (xe_vma_is_userptr(vma)) {
4478 struct mm_struct *mm =
4479 to_userptr_vma(vma)->userptr.notifier.mm;
4480
4481 if (mmget_not_zero(mm))
4482 snap->snap[i].mm = mm;
4483 else
4484 snap->snap[i].data = ERR_PTR(-EFAULT);
4485
4486 snap->snap[i].bo_ofs = xe_vma_userptr(vma);
4487 snap->snap[i].flags |= XE_VM_SNAP_FLAG_USERPTR;
4488 snap->snap[i].uapi_mem_region = 0;
4489 } else if (xe_vma_is_null(vma)) {
4490 snap->snap[i].flags |= XE_VM_SNAP_FLAG_IS_NULL;
4491 snap->snap[i].uapi_mem_region = -1;
4492 } else {
4493 snap->snap[i].data = ERR_PTR(-ENOENT);
4494 snap->snap[i].uapi_mem_region = -1;
4495 }
4496 i++;
4497 }
4498
4499 out_unlock:
4500 mutex_unlock(&vm->snap_mutex);
4501 return snap;
4502 }
4503
xe_vm_snapshot_capture_delayed(struct xe_vm_snapshot * snap)4504 void xe_vm_snapshot_capture_delayed(struct xe_vm_snapshot *snap)
4505 {
4506 if (IS_ERR_OR_NULL(snap))
4507 return;
4508
4509 for (int i = 0; i < snap->num_snaps; i++) {
4510 struct xe_bo *bo = snap->snap[i].bo;
4511 int err;
4512
4513 if (IS_ERR(snap->snap[i].data) ||
4514 snap->snap[i].flags & XE_VM_SNAP_FLAG_IS_NULL)
4515 continue;
4516
4517 snap->snap[i].data = kvmalloc(snap->snap[i].len, GFP_USER);
4518 if (!snap->snap[i].data) {
4519 snap->snap[i].data = ERR_PTR(-ENOMEM);
4520 goto cleanup_bo;
4521 }
4522
4523 if (bo) {
4524 err = xe_bo_read(bo, snap->snap[i].bo_ofs,
4525 snap->snap[i].data, snap->snap[i].len);
4526 } else {
4527 void __user *userptr = (void __user *)(size_t)snap->snap[i].bo_ofs;
4528
4529 kthread_use_mm(snap->snap[i].mm);
4530 if (!copy_from_user(snap->snap[i].data, userptr, snap->snap[i].len))
4531 err = 0;
4532 else
4533 err = -EFAULT;
4534 kthread_unuse_mm(snap->snap[i].mm);
4535
4536 mmput(snap->snap[i].mm);
4537 snap->snap[i].mm = NULL;
4538 }
4539
4540 if (err) {
4541 kvfree(snap->snap[i].data);
4542 snap->snap[i].data = ERR_PTR(err);
4543 }
4544
4545 cleanup_bo:
4546 xe_bo_put(bo);
4547 snap->snap[i].bo = NULL;
4548 }
4549 }
4550
xe_vm_snapshot_print(struct xe_vm_snapshot * snap,struct drm_printer * p)4551 void xe_vm_snapshot_print(struct xe_vm_snapshot *snap, struct drm_printer *p)
4552 {
4553 unsigned long i, j;
4554
4555 if (IS_ERR_OR_NULL(snap)) {
4556 drm_printf(p, "[0].error: %li\n", PTR_ERR(snap));
4557 return;
4558 }
4559
4560 drm_printf(p, "VM.uapi_flags: 0x%x\n", snap->uapi_flags);
4561 for (i = 0; i < snap->num_snaps; i++) {
4562 drm_printf(p, "[%llx].length: 0x%lx\n", snap->snap[i].ofs, snap->snap[i].len);
4563
4564 drm_printf(p, "[%llx].properties: %s|%s|mem_region=0x%lx|pat_index=%d|cpu_caching=%d\n",
4565 snap->snap[i].ofs,
4566 snap->snap[i].flags & XE_VM_SNAP_FLAG_READ_ONLY ?
4567 "read_only" : "read_write",
4568 snap->snap[i].flags & XE_VM_SNAP_FLAG_IS_NULL ?
4569 "null_sparse" :
4570 snap->snap[i].flags & XE_VM_SNAP_FLAG_USERPTR ?
4571 "userptr" : "bo",
4572 snap->snap[i].uapi_mem_region == -1 ? 0 :
4573 BIT(snap->snap[i].uapi_mem_region),
4574 snap->snap[i].pat_index,
4575 snap->snap[i].cpu_caching);
4576
4577 if (IS_ERR(snap->snap[i].data)) {
4578 drm_printf(p, "[%llx].error: %li\n", snap->snap[i].ofs,
4579 PTR_ERR(snap->snap[i].data));
4580 continue;
4581 }
4582
4583 if (snap->snap[i].flags & XE_VM_SNAP_FLAG_IS_NULL)
4584 continue;
4585
4586 drm_printf(p, "[%llx].data: ", snap->snap[i].ofs);
4587
4588 for (j = 0; j < snap->snap[i].len; j += sizeof(u32)) {
4589 u32 *val = snap->snap[i].data + j;
4590 char dumped[ASCII85_BUFSZ];
4591
4592 drm_puts(p, ascii85_encode(*val, dumped));
4593 }
4594
4595 drm_puts(p, "\n");
4596
4597 if (drm_coredump_printer_is_full(p))
4598 return;
4599 }
4600 }
4601
xe_vm_snapshot_free(struct xe_vm_snapshot * snap)4602 void xe_vm_snapshot_free(struct xe_vm_snapshot *snap)
4603 {
4604 unsigned long i;
4605
4606 if (IS_ERR_OR_NULL(snap))
4607 return;
4608
4609 for (i = 0; i < snap->num_snaps; i++) {
4610 if (!IS_ERR(snap->snap[i].data))
4611 kvfree(snap->snap[i].data);
4612 xe_bo_put(snap->snap[i].bo);
4613 if (snap->snap[i].mm)
4614 mmput(snap->snap[i].mm);
4615 }
4616 kvfree(snap);
4617 }
4618
4619 /**
4620 * xe_vma_need_vram_for_atomic - Check if VMA needs VRAM migration for atomic operations
4621 * @xe: Pointer to the Xe device structure
4622 * @vma: Pointer to the virtual memory area (VMA) structure
4623 * @is_atomic: True if this is an atomic access made from the pagefault path
4624 *
4625 * This function determines whether the given VMA needs to be migrated to
4626 * VRAM in order to perform an atomic GPU operation.
4627 *
4628 * Return:
4629 * 1 - Migration to VRAM is required
4630 * 0 - Migration is not required
4631 * -EACCES - Invalid access for atomic memory attr
4632 *
4633 */
xe_vma_need_vram_for_atomic(struct xe_device * xe,struct xe_vma * vma,bool is_atomic)4634 int xe_vma_need_vram_for_atomic(struct xe_device *xe, struct xe_vma *vma, bool is_atomic)
4635 {
4636 u32 atomic_access = xe_vma_bo(vma) ? xe_vma_bo(vma)->attr.atomic_access :
4637 vma->attr.atomic_access;
4638
4639 if (!IS_DGFX(xe) || !is_atomic)
4640 return false;
4641
4642 /*
4643 * NOTE: The checks implemented here are platform-specific. For
4644 * instance, on a device supporting CXL atomics, these would ideally
4645 * work universally without additional handling.
4646 */
4647 switch (atomic_access) {
4648 case DRM_XE_ATOMIC_DEVICE:
4649 return !xe->info.has_device_atomics_on_smem;
4650
4651 case DRM_XE_ATOMIC_CPU:
4652 return -EACCES;
4653
4654 case DRM_XE_ATOMIC_UNDEFINED:
4655 case DRM_XE_ATOMIC_GLOBAL:
4656 default:
4657 return 1;
4658 }
4659 }
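/*
 * Sketch of how a pagefault handler might consume the return value
 * (illustrative; the surrounding handler is not part of this file):
 *
 *	ret = xe_vma_need_vram_for_atomic(xe, vma, is_atomic);
 *	if (ret < 0)
 *		return ret;	// e.g. -EACCES for CPU-only atomics
 *	if (ret)
 *		// migrate the backing store to VRAM before retrying the fault
 */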
4660
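/*
 * xe_vm_alloc_vma() splits (and, in the non-madvise case, merges) existing
 * VMAs so that VMA boundaries exist exactly at the requested range. It builds
 * drm_gpuva ops (madvise ops only split; sm_map ops may also merge), carries
 * over the flags / PAT index of the VMAs being replaced, parses the ops into
 * xe_vma_ops, and finally destroys the replaced VMAs under the vm resv lock.
 */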
xe_vm_alloc_vma(struct xe_vm * vm,struct drm_gpuvm_map_req * map_req,bool is_madvise)4661 static int xe_vm_alloc_vma(struct xe_vm *vm,
4662 struct drm_gpuvm_map_req *map_req,
4663 bool is_madvise)
4664 {
4665 struct xe_vma_ops vops;
4666 struct drm_gpuva_ops *ops = NULL;
4667 struct drm_gpuva_op *__op;
4668 unsigned int vma_flags = 0;
4669 bool remap_op = false;
4670 struct xe_vma_mem_attr tmp_attr = {};
4671 u16 default_pat;
4672 int err;
4673
4674 lockdep_assert_held_write(&vm->lock);
4675
4676 if (is_madvise)
4677 ops = drm_gpuvm_madvise_ops_create(&vm->gpuvm, map_req);
4678 else
4679 ops = drm_gpuvm_sm_map_ops_create(&vm->gpuvm, map_req);
4680
4681 if (IS_ERR(ops))
4682 return PTR_ERR(ops);
4683
4684 if (list_empty(&ops->list)) {
4685 err = 0;
4686 goto free_ops;
4687 }
4688
4689 drm_gpuva_for_each_op(__op, ops) {
4690 struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
4691 struct xe_vma *vma = NULL;
4692
4693 if (!is_madvise) {
4694 if (__op->op == DRM_GPUVA_OP_UNMAP) {
4695 vma = gpuva_to_vma(op->base.unmap.va);
4696 XE_WARN_ON(!xe_vma_has_default_mem_attrs(vma));
4697 default_pat = vma->attr.default_pat_index;
4698 vma_flags = vma->gpuva.flags;
4699 }
4700
4701 if (__op->op == DRM_GPUVA_OP_REMAP) {
4702 vma = gpuva_to_vma(op->base.remap.unmap->va);
4703 default_pat = vma->attr.default_pat_index;
4704 vma_flags = vma->gpuva.flags;
4705 }
4706
4707 if (__op->op == DRM_GPUVA_OP_MAP) {
4708 op->map.vma_flags |= vma_flags & XE_VMA_CREATE_MASK;
4709 op->map.pat_index = default_pat;
4710 }
4711 } else {
4712 if (__op->op == DRM_GPUVA_OP_REMAP) {
4713 vma = gpuva_to_vma(op->base.remap.unmap->va);
4714 xe_assert(vm->xe, !remap_op);
4715 xe_assert(vm->xe, xe_vma_has_no_bo(vma));
4716 remap_op = true;
4717 vma_flags = vma->gpuva.flags;
4718 }
4719
4720 if (__op->op == DRM_GPUVA_OP_MAP) {
4721 xe_assert(vm->xe, remap_op);
4722 remap_op = false;
4723 /*
4724 * In case of madvise ops DRM_GPUVA_OP_MAP is
4725 * always after DRM_GPUVA_OP_REMAP, so ensure
4726 * to propagate the flags from the vma we're
4727 * unmapping.
4728 */
4729 op->map.vma_flags |= vma_flags & XE_VMA_CREATE_MASK;
4730 }
4731 }
4732 print_op(vm->xe, __op);
4733 }
4734
4735 xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
4736
4737 if (is_madvise)
4738 vops.flags |= XE_VMA_OPS_FLAG_MADVISE;
4739 else
4740 vops.flags |= XE_VMA_OPS_FLAG_ALLOW_SVM_UNMAP;
4741
4742 err = vm_bind_ioctl_ops_parse(vm, ops, &vops);
4743 if (err)
4744 goto unwind_ops;
4745
4746 xe_vm_lock(vm, false);
4747
4748 drm_gpuva_for_each_op(__op, ops) {
4749 struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
4750 struct xe_vma *vma;
4751
4752 if (__op->op == DRM_GPUVA_OP_UNMAP) {
4753 vma = gpuva_to_vma(op->base.unmap.va);
4754 /* There should be no unmap for madvise */
4755 if (is_madvise)
4756 XE_WARN_ON("UNEXPECTED UNMAP");
4757
4758 xe_vma_destroy(vma, NULL);
4759 } else if (__op->op == DRM_GPUVA_OP_REMAP) {
4760 vma = gpuva_to_vma(op->base.remap.unmap->va);
4761 /* For madvise ops, store the attributes of the VMA being unmapped
4762 * by the REMAP so they can be assigned to the newly created MAP vma.
4763 */
4764 if (is_madvise)
4765 xe_vma_mem_attr_copy(&tmp_attr, &vma->attr);
4766
4767 xe_vma_destroy(gpuva_to_vma(op->base.remap.unmap->va), NULL);
4768 } else if (__op->op == DRM_GPUVA_OP_MAP) {
4769 vma = op->map.vma;
4770 /* For madvise ops, MAP is always preceded by REMAP, so
4771 * tmp_attr already holds sane values, making it safe to
4772 * copy them to the new vma.
4773 */
4774 if (is_madvise)
4775 xe_vma_mem_attr_copy(&vma->attr, &tmp_attr);
4776 }
4777 }
4778
4779 xe_vm_unlock(vm);
4780 drm_gpuva_ops_free(&vm->gpuvm, ops);
4781 xe_vma_mem_attr_fini(&tmp_attr);
4782 return 0;
4783
4784 unwind_ops:
4785 vm_bind_ioctl_ops_unwind(vm, &ops, 1);
4786 free_ops:
4787 drm_gpuva_ops_free(&vm->gpuvm, ops);
4788 return err;
4789 }
4790
4791 /**
4792 * xe_vm_alloc_madvise_vma - Allocate VMA's with madvise ops
4793 * @vm: Pointer to the xe_vm structure
4794 * @start: Starting input address
4795 * @range: Size of the input range
4796 *
4797 * This function splits existing VMAs for the user-provided input range.
4798 *
4799 * Return: 0 on success, negative error code on failure.
4800 */
xe_vm_alloc_madvise_vma(struct xe_vm * vm,uint64_t start,uint64_t range)4801 int xe_vm_alloc_madvise_vma(struct xe_vm *vm, uint64_t start, uint64_t range)
4802 {
4803 struct drm_gpuvm_map_req map_req = {
4804 .map.va.addr = start,
4805 .map.va.range = range,
4806 };
4807
4808 lockdep_assert_held_write(&vm->lock);
4809
4810 vm_dbg(&vm->xe->drm, "MADVISE_OPS_CREATE: addr=0x%016llx, size=0x%016llx", start, range);
4811
4812 return xe_vm_alloc_vma(vm, &map_req, true);
4813 }
4814
is_cpu_addr_vma_with_default_attr(struct xe_vma * vma)4815 static bool is_cpu_addr_vma_with_default_attr(struct xe_vma *vma)
4816 {
4817 return vma && xe_vma_is_cpu_addr_mirror(vma) &&
4818 xe_vma_has_default_mem_attrs(vma);
4819 }
4820
4821 /**
4822 * xe_vm_find_cpu_addr_mirror_vma_range - Extend a VMA range to include adjacent CPU-mirrored VMAs
4823 * @vm: VM to search within
4824 * @start: Input/output pointer to the starting address of the range
4825 * @end: Input/output pointer to the end address of the range
4826 *
4827 * Given a range defined by @start and @end, this function checks the VMAs
4828 * immediately before and after the range. If those neighboring VMAs are
4829 * CPU-address-mirrored and have default memory attributes, the function
4830 * updates @start and @end to include them. This extended range can then
4831 * be used for merging or other operations that require a unified VMA.
4832 *
4833 * The function does not perform the merge itself; it only computes the
4834 * mergeable boundaries.
4835 */
xe_vm_find_cpu_addr_mirror_vma_range(struct xe_vm * vm,u64 * start,u64 * end)4836 void xe_vm_find_cpu_addr_mirror_vma_range(struct xe_vm *vm, u64 *start, u64 *end)
4837 {
4838 struct xe_vma *prev, *next;
4839
4840 lockdep_assert_held(&vm->lock);
4841
4842 if (*start >= SZ_4K) {
4843 prev = xe_vm_find_vma_by_addr(vm, *start - SZ_4K);
4844 if (is_cpu_addr_vma_with_default_attr(prev))
4845 *start = xe_vma_start(prev);
4846 }
4847
4848 if (*end < vm->size) {
4849 next = xe_vm_find_vma_by_addr(vm, *end + 1);
4850 if (is_cpu_addr_vma_with_default_attr(next))
4851 *end = xe_vma_end(next);
4852 }
4853 }
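/*
 * Typical usage (sketch): expand the user range to swallow mergeable
 * neighbours before recreating the CPU-address-mirror VMA:
 *
 *	xe_vm_find_cpu_addr_mirror_vma_range(vm, &start, &end);
 *	err = xe_vm_alloc_cpu_addr_mirror_vma(vm, start, end - start);
 */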
4854
4855 /**
4856 * xe_vm_alloc_cpu_addr_mirror_vma - Allocate CPU addr mirror vma
4857 * @vm: Pointer to the xe_vm structure
4858 * @start: Starting input address
4859 * @range: Size of the input range
4860 *
4861 * This function splits/merges existing VMAs for the user-provided input range.
4862 *
4863 * Return: 0 on success, negative error code on failure.
4864 */
xe_vm_alloc_cpu_addr_mirror_vma(struct xe_vm * vm,uint64_t start,uint64_t range)4865 int xe_vm_alloc_cpu_addr_mirror_vma(struct xe_vm *vm, uint64_t start, uint64_t range)
4866 {
4867 struct drm_gpuvm_map_req map_req = {
4868 .map.va.addr = start,
4869 .map.va.range = range,
4870 };
4871
4872 lockdep_assert_held_write(&vm->lock);
4873
4874 vm_dbg(&vm->xe->drm, "CPU_ADDR_MIRROR_VMA_OPS_CREATE: addr=0x%016llx, size=0x%016llx",
4875 start, range);
4876
4877 return xe_vm_alloc_vma(vm, &map_req, false);
4878 }
4879
4880 /**
4881 * xe_vm_add_exec_queue() - Add exec queue to VM
4882 * @vm: The VM.
4883 * @q: The exec_queue
4884 *
4885 * Add exec queue to VM, skipped if the device does not have context-based TLB
4886 * invalidations.
4887 */
xe_vm_add_exec_queue(struct xe_vm * vm,struct xe_exec_queue * q)4888 void xe_vm_add_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
4889 {
4890 struct xe_device *xe = vm->xe;
4891
4892 /* User VMs and queues only */
4893 xe_assert(xe, !(q->flags & EXEC_QUEUE_FLAG_KERNEL));
4894 xe_assert(xe, !(q->flags & EXEC_QUEUE_FLAG_PERMANENT));
4895 xe_assert(xe, !(q->flags & EXEC_QUEUE_FLAG_VM));
4896 xe_assert(xe, !(q->flags & EXEC_QUEUE_FLAG_MIGRATE));
4897 xe_assert(xe, vm->xef);
4898 xe_assert(xe, vm == q->vm);
4899
4900 if (!xe->info.has_ctx_tlb_inval)
4901 return;
4902
4903 down_write(&vm->exec_queues.lock);
4904 list_add(&q->vm_exec_queue_link, &vm->exec_queues.list[q->gt->info.id]);
4905 ++vm->exec_queues.count[q->gt->info.id];
4906 up_write(&vm->exec_queues.lock);
4907 }
4908
4909 /**
4910 * xe_vm_remove_exec_queue() - Remove exec queue from VM
4911 * @vm: The VM.
4912 * @q: The exec_queue
4913 *
4914 * Remove exec queue from VM, skipped if the device does not have context-based
4915 * TLB invalidations.
4916 */
xe_vm_remove_exec_queue(struct xe_vm * vm,struct xe_exec_queue * q)4917 void xe_vm_remove_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
4918 {
4919 if (!vm->xe->info.has_ctx_tlb_inval)
4920 return;
4921
4922 down_write(&vm->exec_queues.lock);
4923 if (!list_empty(&q->vm_exec_queue_link)) {
4924 list_del(&q->vm_exec_queue_link);
4925 --vm->exec_queues.count[q->gt->info.id];
4926 }
4927 up_write(&vm->exec_queues.lock);
4928 }
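/*
 * xe_vm_add_exec_queue() and xe_vm_remove_exec_queue() are expected to be
 * called in pairs over the lifetime of a user exec queue, keeping the
 * per-GT exec_queues.list / exec_queues.count in sync so that context-based
 * TLB invalidation can target only the queues bound to this VM.
 */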
4929