xref: /linux/drivers/gpu/drm/xe/xe_pt.c (revision dd08ebf6c3525a7ea2186e636df064ea47281987)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2022 Intel Corporation
4  */
5 
6 #include "xe_bo.h"
7 #include "xe_device.h"
8 #include "xe_gt.h"
9 #include "xe_migrate.h"
10 #include "xe_pt.h"
11 #include "xe_pt_types.h"
12 #include "xe_pt_walk.h"
13 #include "xe_vm.h"
14 #include "xe_res_cursor.h"
15 
16 struct xe_pt_dir {
17 	struct xe_pt pt;
18 	/** @dir: Directory structure for the xe_pt_walk functionality */
19 	struct xe_ptw_dir dir;
20 };
21 
22 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
23 #define xe_pt_set_addr(__xe_pt, __addr) ((__xe_pt)->addr = (__addr))
24 #define xe_pt_addr(__xe_pt) ((__xe_pt)->addr)
25 #else
26 #define xe_pt_set_addr(__xe_pt, __addr)
27 #define xe_pt_addr(__xe_pt) 0ull
28 #endif
29 
30 static const u64 xe_normal_pt_shifts[] = {12, 21, 30, 39, 48};
31 static const u64 xe_compact_pt_shifts[] = {16, 21, 30, 39, 48};
32 
33 #define XE_PT_HIGHEST_LEVEL (ARRAY_SIZE(xe_normal_pt_shifts) - 1)
34 
35 static struct xe_pt_dir *as_xe_pt_dir(struct xe_pt *pt)
36 {
37 	return container_of(pt, struct xe_pt_dir, pt);
38 }
39 
40 static struct xe_pt *xe_pt_entry(struct xe_pt_dir *pt_dir, unsigned int index)
41 {
42 	return container_of(pt_dir->dir.entries[index], struct xe_pt, base);
43 }
44 
45 /**
46  * gen8_pde_encode() - Encode a page-table directory entry pointing to
47  * another page-table.
48  * @bo: The page-table bo of the page-table to point to.
49  * @bo_offset: Offset in the page-table bo to point to.
50  * @level: The cache level indicating the caching of @bo.
51  *
52  * TODO: Rename.
53  *
54  * Return: An encoded page directory entry. No errors.
55  */
56 u64 gen8_pde_encode(struct xe_bo *bo, u64 bo_offset,
57 		    const enum xe_cache_level level)
58 {
59 	u64 pde;
60 	bool is_lmem;
61 
62 	pde = xe_bo_addr(bo, bo_offset, GEN8_PAGE_SIZE, &is_lmem);
63 	pde |= GEN8_PAGE_PRESENT | GEN8_PAGE_RW;
64 
65 	XE_WARN_ON(IS_DGFX(xe_bo_device(bo)) && !is_lmem);
66 
67 	/* FIXME: I don't think the PPAT handling is correct for MTL */
68 
69 	if (level != XE_CACHE_NONE)
70 		pde |= PPAT_CACHED_PDE;
71 	else
72 		pde |= PPAT_UNCACHED;
73 
74 	return pde;
75 }
76 
77 static dma_addr_t vma_addr(struct xe_vma *vma, u64 offset,
78 			   size_t page_size, bool *is_lmem)
79 {
80 	if (xe_vma_is_userptr(vma)) {
81 		struct xe_res_cursor cur;
82 		u64 page;
83 
84 		*is_lmem = false;
85 		page = offset >> PAGE_SHIFT;
86 		offset &= (PAGE_SIZE - 1);
87 
88 		xe_res_first_sg(vma->userptr.sg, page << PAGE_SHIFT, page_size,
89 				&cur);
90 		return xe_res_dma(&cur) + offset;
91 	} else {
92 		return xe_bo_addr(vma->bo, offset, page_size, is_lmem);
93 	}
94 }
95 
96 static u64 __gen8_pte_encode(u64 pte, enum xe_cache_level cache, u32 flags,
97 			     u32 pt_level)
98 {
99 	pte |= GEN8_PAGE_PRESENT | GEN8_PAGE_RW;
100 
101 	if (unlikely(flags & PTE_READ_ONLY))
102 		pte &= ~GEN8_PAGE_RW;
103 
104 	/* FIXME: I don't think the PPAT handling is correct for MTL */
105 
106 	switch (cache) {
107 	case XE_CACHE_NONE:
108 		pte |= PPAT_UNCACHED;
109 		break;
110 	case XE_CACHE_WT:
111 		pte |= PPAT_DISPLAY_ELLC;
112 		break;
113 	default:
114 		pte |= PPAT_CACHED;
115 		break;
116 	}
117 
118 	if (pt_level == 1)
119 		pte |= GEN8_PDE_PS_2M;
120 	else if (pt_level == 2)
121 		pte |= GEN8_PDPE_PS_1G;
122 
123 	/* XXX: Does hw support 1 GiB pages? */
124 	XE_BUG_ON(pt_level > 2);
125 
126 	return pte;
127 }
128 
129 /**
130  * gen8_pte_encode() - Encode a page-table entry pointing to memory.
131  * @vma: The vma representing the memory to point to.
132  * @bo: The bo representing the memory to point to if @vma is NULL.
133  * @offset: The offset into @vma or @bo.
134  * @cache: The cache level indicating the desired caching of the pointed-to memory.
135  * @flags: Currently only supports PTE_READ_ONLY for read-only access.
136  * @pt_level: The page-table level of the page-table into which the entry
137  * is to be inserted.
138  *
139  * TODO: Rename.
140  *
141  * Return: An encoded page-table entry. No errors.
142  */
143 u64 gen8_pte_encode(struct xe_vma *vma, struct xe_bo *bo,
144 		    u64 offset, enum xe_cache_level cache,
145 		    u32 flags, u32 pt_level)
146 {
147 	u64 pte;
148 	bool is_vram;
149 
150 	if (vma)
151 		pte = vma_addr(vma, offset, GEN8_PAGE_SIZE, &is_vram);
152 	else
153 		pte = xe_bo_addr(bo, offset, GEN8_PAGE_SIZE, &is_vram);
154 
155 	if (is_vram) {
156 		pte |= GEN12_PPGTT_PTE_LM;
157 		if (vma && vma->use_atomic_access_pte_bit)
158 			pte |= GEN12_USM_PPGTT_PTE_AE;
159 	}
160 
161 	return __gen8_pte_encode(pte, cache, flags, pt_level);
162 }
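
/*
 * A worked composition sketch (not part of the driver source): for a 2 MiB
 * VRAM page bound through a vma with the atomic-access bit set, the helpers
 * above conceptually produce
 *
 *   pte = dma_addr                          (from vma_addr()/xe_bo_addr())
 *       | GEN8_PAGE_PRESENT | GEN8_PAGE_RW
 *       | GEN12_PPGTT_PTE_LM                (is_vram)
 *       | GEN12_USM_PPGTT_PTE_AE            (use_atomic_access_pte_bit)
 *       | PPAT_CACHED                       (XE_CACHE_WB)
 *       | GEN8_PDE_PS_2M;                   (pt_level == 1)
 *
 * A PTE_READ_ONLY flag would clear GEN8_PAGE_RW again.
 */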
163 
164 static u64 __xe_pt_empty_pte(struct xe_gt *gt, struct xe_vm *vm,
165 			     unsigned int level)
166 {
167 	u8 id = gt->info.id;
168 
169 	XE_BUG_ON(xe_gt_is_media_type(gt));
170 
171 	if (!vm->scratch_bo[id])
172 		return 0;
173 
174 	if (level == 0) {
175 		u64 empty = gen8_pte_encode(NULL, vm->scratch_bo[id], 0,
176 					    XE_CACHE_WB, 0, 0);
177 		if (vm->flags & XE_VM_FLAGS_64K)
178 			empty |= GEN12_PTE_PS64;
179 
180 		return empty;
181 	} else {
182 		return gen8_pde_encode(vm->scratch_pt[id][level - 1]->bo, 0,
183 				       XE_CACHE_WB);
184 	}
185 }
186 
187 /**
188  * xe_pt_create() - Create a page-table.
189  * @vm: The vm to create for.
190  * @gt: The gt to create for.
191  * @level: The page-table level.
192  *
193  * Allocate and initialize a single struct xe_pt metadata structure. Also
194  * create the corresponding page-table bo, but don't initialize it. If the
195  * level is greater than zero, then it's assumed to be a directory page-
196  * table and the directory structure is also allocated and initialized to
197  * NULL pointers.
198  *
199  * Return: A valid struct xe_pt pointer on success, an error pointer on
200  * error.
201  */
202 struct xe_pt *xe_pt_create(struct xe_vm *vm, struct xe_gt *gt,
203 			   unsigned int level)
204 {
205 	struct xe_pt *pt;
206 	struct xe_bo *bo;
207 	size_t size;
208 	int err;
209 
210 	size = !level ?  sizeof(struct xe_pt) : sizeof(struct xe_pt_dir) +
211 		GEN8_PDES * sizeof(struct xe_ptw *);
212 	pt = kzalloc(size, GFP_KERNEL);
213 	if (!pt)
214 		return ERR_PTR(-ENOMEM);
215 
216 	bo = xe_bo_create_pin_map(vm->xe, gt, vm, SZ_4K,
217 				  ttm_bo_type_kernel,
218 				  XE_BO_CREATE_VRAM_IF_DGFX(gt) |
219 				  XE_BO_CREATE_IGNORE_MIN_PAGE_SIZE_BIT |
220 				  XE_BO_CREATE_PINNED_BIT);
221 	if (IS_ERR(bo)) {
222 		err = PTR_ERR(bo);
223 		goto err_kfree;
224 	}
225 	pt->bo = bo;
226 	pt->level = level;
227 	pt->base.dir = level ? &as_xe_pt_dir(pt)->dir : NULL;
228 
229 	XE_BUG_ON(level > XE_VM_MAX_LEVEL);
230 
231 	return pt;
232 
233 err_kfree:
234 	kfree(pt);
235 	return ERR_PTR(err);
236 }
237 
238 /**
239  * xe_pt_populate_empty() - Populate a page-table bo with scratch- or zero
240  * entries.
241  * @gt: The gt whose scratch pagetable to use.
242  * @vm: The vm we populate for.
243  * @pt: The pagetable whose bo to initialize.
244  *
245  * Populate the page-table bo of @pt with entries pointing into the gt's
246  * scratch page-table tree if any. Otherwise populate with zeros.
247  */
248 void xe_pt_populate_empty(struct xe_gt *gt, struct xe_vm *vm,
249 			  struct xe_pt *pt)
250 {
251 	struct iosys_map *map = &pt->bo->vmap;
252 	u64 empty;
253 	int i;
254 
255 	XE_BUG_ON(xe_gt_is_media_type(gt));
256 
257 	if (!vm->scratch_bo[gt->info.id]) {
258 		/*
259 		 * FIXME: Some memory is already allocated zeroed?
260 		 * Find out which memory that is and avoid this memset...
261 		 */
262 		xe_map_memset(vm->xe, map, 0, 0, SZ_4K);
263 	} else {
264 		empty = __xe_pt_empty_pte(gt, vm, pt->level);
265 		for (i = 0; i < GEN8_PDES; i++)
266 			xe_pt_write(vm->xe, map, i, empty);
267 	}
268 }
269 
270 /**
271  * xe_pt_shift() - Return the ilog2 value of the size of the address range of
272  * a page-table at a certain level.
273  * @level: The level.
274  *
275  * Return: The ilog2 value of the size of the address range of a page-table
276  * at level @level.
277  */
278 unsigned int xe_pt_shift(unsigned int level)
279 {
280 	return GEN8_PTE_SHIFT + GEN8_PDE_SHIFT * level;
281 }
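
/*
 * A quick worked example, assuming GEN8_PTE_SHIFT == 12 and
 * GEN8_PDE_SHIFT == 9 as implied by xe_normal_pt_shifts above:
 *
 *   xe_pt_shift(0) == 12  ->  4 KiB per level-0 entry
 *   xe_pt_shift(1) == 21  ->  2 MiB per level-1 entry
 *   xe_pt_shift(2) == 30  ->  1 GiB per level-2 entry
 *
 * xe_vm_dbg_print_entries() below uses this value as the per-entry address
 * coverage of the page-table being updated.
 */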
282 
283 /**
284  * xe_pt_destroy() - Destroy a page-table tree.
285  * @pt: The root of the page-table tree to destroy.
286  * @flags: vm flags. Currently unused.
287  * @deferred: List head of lockless list for deferred putting. NULL for
288  *            immediate putting.
289  *
290  * Puts the page-table bo, recursively calls xe_pt_destroy on all children
291  * and finally frees @pt. TODO: Can we remove the @flags argument?
292  */
293 void xe_pt_destroy(struct xe_pt *pt, u32 flags, struct llist_head *deferred)
294 {
295 	int i;
296 
297 	if (!pt)
298 		return;
299 
300 	XE_BUG_ON(!list_empty(&pt->bo->vmas));
301 	xe_bo_unpin(pt->bo);
302 	xe_bo_put_deferred(pt->bo, deferred);
303 
304 	if (pt->level > 0 && pt->num_live) {
305 		struct xe_pt_dir *pt_dir = as_xe_pt_dir(pt);
306 
307 		for (i = 0; i < GEN8_PDES; i++) {
308 			if (xe_pt_entry(pt_dir, i))
309 				xe_pt_destroy(xe_pt_entry(pt_dir, i), flags,
310 					      deferred);
311 		}
312 	}
313 	kfree(pt);
314 }
315 
316 /**
317  * xe_pt_create_scratch() - Setup a scratch memory pagetable tree for the
318  * given gt and vm.
319  * @xe: xe device.
320  * @gt: gt to set up for.
321  * @vm: vm to set up for.
322  *
323  * Sets up a pagetable tree with one page-table per level and a single
324  * leaf bo. All pagetable entries point to the single page-table or,
325  * for L0, the single bo one level below.
326  *
327  * Return: 0 on success, negative error code on error.
328  */
329 int xe_pt_create_scratch(struct xe_device *xe, struct xe_gt *gt,
330 			 struct xe_vm *vm)
331 {
332 	u8 id = gt->info.id;
333 	int i;
334 
335 	vm->scratch_bo[id] = xe_bo_create(xe, gt, vm, SZ_4K,
336 					  ttm_bo_type_kernel,
337 					  XE_BO_CREATE_VRAM_IF_DGFX(gt) |
338 					  XE_BO_CREATE_IGNORE_MIN_PAGE_SIZE_BIT |
339 					  XE_BO_CREATE_PINNED_BIT);
340 	if (IS_ERR(vm->scratch_bo[id]))
341 		return PTR_ERR(vm->scratch_bo[id]);
342 	xe_bo_pin(vm->scratch_bo[id]);
343 
344 	for (i = 0; i < vm->pt_root[id]->level; i++) {
345 		vm->scratch_pt[id][i] = xe_pt_create(vm, gt, i);
346 		if (IS_ERR(vm->scratch_pt[id][i]))
347 			return PTR_ERR(vm->scratch_pt[id][i]);
348 
349 		xe_pt_populate_empty(gt, vm, vm->scratch_pt[id][i]);
350 	}
351 
352 	return 0;
353 }
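
/*
 * For example (a sketch, assuming a vm whose pt_root sits at level 3): the
 * loop above creates scratch_pt[0..2]; every entry of scratch_pt[0] points
 * at scratch_bo and every entry of scratch_pt[i > 0] points at
 * scratch_pt[i - 1], courtesy of __xe_pt_empty_pte(). Any empty entry of
 * the root itself then resolves to scratch_pt[2] the same way.
 */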
354 
355 /**
356  * DOC: Pagetable building
357  *
358  * Below we use the term "page-table" for both page-directories, containing
359  * pointers to lower level page-directories or page-tables, and level 0
360  * page-tables that contain only page-table-entries pointing to memory pages.
361  *
362  * When inserting an address range in an already existing page-table tree
363  * there will typically be a set of page-tables that are shared with other
364  * address ranges, and a set that are private to this address range.
365  * The set of shared page-tables can be at most two per level,
366  * and those can't be updated immediately because the entries of those
367  * page-tables may still be in use by the gpu for other mappings. Therefore
368  * when inserting entries into those, we instead stage those insertions by
369  * adding insertion data into struct xe_vm_pgtable_update structures. This
370  * data (subtrees for the cpu and page-table entries for the gpu) is then
371  * added in a separate commit step. CPU-data is committed while still under the
372  * vm lock, the object lock and for userptr, the notifier lock in read mode.
373  * The GPU async data is committed either by the GPU or CPU after fulfilling
374  * relevant dependencies.
375  * For non-shared page-tables (and, in fact, for shared ones that aren't
376  * existing at the time of staging), we add the data in-place without the
377  * special update structures. This private part of the page-table tree will
378  * remain disconnected from the vm page-table tree until data is committed to
379  * the shared page tables of the vm tree in the commit phase.
380  */
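
/*
 * Note: the "at most two shared page-tables per level" observation above is
 * presumably what bounds the update arrays used in this file: a bind or
 * unbind touches at most two boundary (shared) page-tables per level below
 * the root, plus the root itself, which matches the
 * XE_VM_MAX_LEVEL * 2 + 1 sized xe_vm_pgtable_update arrays and the
 * overflow check in xe_pt_new_shared().
 */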
381 
382 struct xe_pt_update {
383 	/** @update: The update structure we're building for this parent. */
384 	struct xe_vm_pgtable_update *update;
385 	/** @parent: The parent. Used to detect a parent change. */
386 	struct xe_pt *parent;
387 	/** @preexisting: Whether the parent was pre-existing or allocated */
388 	bool preexisting;
389 };
390 
391 struct xe_pt_stage_bind_walk {
392 	/** @base: The base class. */
393 	struct xe_pt_walk base;
394 
395 	/* Input parameters for the walk */
396 	/** @vm: The vm we're building for. */
397 	struct xe_vm *vm;
398 	/** @gt: The gt we're building for. */
399 	struct xe_gt *gt;
400 	/** @cache: Desired cache level for the ptes */
401 	enum xe_cache_level cache;
402 	/** @default_pte: PTE flag only template. No address is associated */
403 	u64 default_pte;
404 	/** @dma_offset: DMA offset to add to the PTE. */
405 	u64 dma_offset;
406 	/**
407 	 * @needs_64k: This address range enforces 64K alignment and
408 	 * granularity.
409 	 */
410 	bool needs_64K;
411 	/**
412 	 * @pte_flags: Flags determining PTE setup. These are not flags
413 	 * encoded directly in the PTE. See @default_pte for those.
414 	 */
415 	u32 pte_flags;
416 
417 	/* Also input, but is updated during the walk */
418 	/** @curs: The DMA address cursor. */
419 	struct xe_res_cursor *curs;
420 	/** @va_curs_start: The virtual address corresponding to @curs->start */
421 	u64 va_curs_start;
422 
423 	/* Output */
424 	struct xe_walk_update {
425 		/** @wupd.entries: Caller provided storage. */
426 		struct xe_vm_pgtable_update *entries;
427 		/** @wupd.num_used_entries: Number of update @entries used. */
428 		unsigned int num_used_entries;
429 		/** @wupd.updates: Tracks the update entry at a given level */
430 		struct xe_pt_update updates[XE_VM_MAX_LEVEL + 1];
431 	} wupd;
432 
433 	/* Walk state */
434 	/**
435 	 * @l0_end_addr: The end address of the current l0 leaf. Used for
436 	 * 64K granularity detection.
437 	 */
438 	u64 l0_end_addr;
439 	/** @addr_64K: The start address of the current 64K chunk. */
440 	u64 addr_64K;
441 	/** @found_64K: Whether @addr_64K actually points to a 64K chunk. */
442 	bool found_64K;
443 };
444 
445 static int
446 xe_pt_new_shared(struct xe_walk_update *wupd, struct xe_pt *parent,
447 		 pgoff_t offset, bool alloc_entries)
448 {
449 	struct xe_pt_update *upd = &wupd->updates[parent->level];
450 	struct xe_vm_pgtable_update *entry;
451 
452 	/*
453 	 * For *each level*, we can only have one active
454 	 * struct xe_pt_update at any one time. Once we move on to a
455 	 * new parent and page-directory, the old one is complete, and
456 	 * updates are either already stored in the build tree or in
457 	 * @wupd->entries
458 	 */
459 	if (likely(upd->parent == parent))
460 		return 0;
461 
462 	upd->parent = parent;
463 	upd->preexisting = true;
464 
465 	if (wupd->num_used_entries == XE_VM_MAX_LEVEL * 2 + 1)
466 		return -EINVAL;
467 
468 	entry = wupd->entries + wupd->num_used_entries++;
469 	upd->update = entry;
470 	entry->ofs = offset;
471 	entry->pt_bo = parent->bo;
472 	entry->pt = parent;
473 	entry->flags = 0;
474 	entry->qwords = 0;
475 
476 	if (alloc_entries) {
477 		entry->pt_entries = kmalloc_array(GEN8_PDES,
478 						  sizeof(*entry->pt_entries),
479 						  GFP_KERNEL);
480 		if (!entry->pt_entries)
481 			return -ENOMEM;
482 	}
483 
484 	return 0;
485 }
486 
487 /*
488  * NOTE: This is a very frequently called function so we allow ourselves
489  * to annotate (using branch prediction hints) the fastpath of updating a
490  * non-pre-existing pagetable with leaf ptes.
491  */
492 static int
493 xe_pt_insert_entry(struct xe_pt_stage_bind_walk *xe_walk, struct xe_pt *parent,
494 		   pgoff_t offset, struct xe_pt *xe_child, u64 pte)
495 {
496 	struct xe_pt_update *upd = &xe_walk->wupd.updates[parent->level];
497 	struct xe_pt_update *child_upd = xe_child ?
498 		&xe_walk->wupd.updates[xe_child->level] : NULL;
499 	int ret;
500 
501 	ret = xe_pt_new_shared(&xe_walk->wupd, parent, offset, true);
502 	if (unlikely(ret))
503 		return ret;
504 
505 	/*
506 	 * Register this new pagetable so that it won't be recognized as
507 	 * a shared pagetable by a subsequent insertion.
508 	 */
509 	if (unlikely(child_upd)) {
510 		child_upd->update = NULL;
511 		child_upd->parent = xe_child;
512 		child_upd->preexisting = false;
513 	}
514 
515 	if (likely(!upd->preexisting)) {
516 		/* Continue building a non-connected subtree. */
517 		struct iosys_map *map = &parent->bo->vmap;
518 
519 		if (unlikely(xe_child))
520 			parent->base.dir->entries[offset] = &xe_child->base;
521 
522 		xe_pt_write(xe_walk->vm->xe, map, offset, pte);
523 		parent->num_live++;
524 	} else {
525 		/* Shared pt. Stage update. */
526 		unsigned int idx;
527 		struct xe_vm_pgtable_update *entry = upd->update;
528 
529 		idx = offset - entry->ofs;
530 		entry->pt_entries[idx].pt = xe_child;
531 		entry->pt_entries[idx].pte = pte;
532 		entry->qwords++;
533 	}
534 
535 	return 0;
536 }
537 
538 static bool xe_pt_hugepte_possible(u64 addr, u64 next, unsigned int level,
539 				   struct xe_pt_stage_bind_walk *xe_walk)
540 {
541 	u64 size, dma;
542 
543 	/* Does the virtual range requested cover a huge pte? */
544 	if (!xe_pt_covers(addr, next, level, &xe_walk->base))
545 		return false;
546 
547 	/* Does the DMA segment cover the whole pte? */
548 	if (next - xe_walk->va_curs_start > xe_walk->curs->size)
549 		return false;
550 
551 	/* Is the DMA address huge PTE size aligned? */
552 	size = next - addr;
553 	dma = addr - xe_walk->va_curs_start + xe_res_dma(xe_walk->curs);
554 
555 	return IS_ALIGNED(dma, size);
556 }
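
/*
 * A worked example (not part of the driver source): with the normal shifts,
 * a level-1 huge PTE maps 2 MiB, so the check above passes only when
 * [addr, next) is exactly one 2 MiB-aligned, 2 MiB-sized virtual chunk, the
 * remaining DMA segment in @curs covers all of it, and the DMA address that
 * would be programmed is itself 2 MiB aligned.
 */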
557 
558 /*
559  * Scan the requested mapping to check whether it can be done entirely
560  * with 64K PTEs.
561  */
562 static bool
563 xe_pt_scan_64K(u64 addr, u64 next, struct xe_pt_stage_bind_walk *xe_walk)
564 {
565 	struct xe_res_cursor curs = *xe_walk->curs;
566 
567 	if (!IS_ALIGNED(addr, SZ_64K))
568 		return false;
569 
570 	if (next > xe_walk->l0_end_addr)
571 		return false;
572 
573 	xe_res_next(&curs, addr - xe_walk->va_curs_start);
574 	for (; addr < next; addr += SZ_64K) {
575 		if (!IS_ALIGNED(xe_res_dma(&curs), SZ_64K) || curs.size < SZ_64K)
576 			return false;
577 
578 		xe_res_next(&curs, SZ_64K);
579 	}
580 
581 	return addr == next;
582 }
583 
584 /*
585  * For non-compact "normal" 4K level-0 pagetables, we want to try to group
586  * addresses together in 64K-contiguous regions to add a 64K TLB hint for the
587  * device to the PTE.
588  * This function determines whether the address is part of such a
589  * segment. For VRAM in normal pagetables, this is strictly necessary on
590  * some devices.
591  */
592 static bool
593 xe_pt_is_pte_ps64K(u64 addr, u64 next, struct xe_pt_stage_bind_walk *xe_walk)
594 {
595 	/* Address is within an already found 64k region */
596 	if (xe_walk->found_64K && addr - xe_walk->addr_64K < SZ_64K)
597 		return true;
598 
599 	xe_walk->found_64K = xe_pt_scan_64K(addr, addr + SZ_64K, xe_walk);
600 	xe_walk->addr_64K = addr;
601 
602 	return xe_walk->found_64K;
603 }
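
/*
 * A worked example (not part of the driver source): when staging 4 KiB PTEs
 * for a 64 KiB-aligned virtual range whose backing DMA addresses are
 * 64 KiB aligned and contiguous per 64 KiB chunk, xe_pt_scan_64K() succeeds
 * for the first address of a chunk and xe_pt_is_pte_ps64K() then returns
 * true for the remaining fifteen 4 KiB PTEs of that chunk without
 * rescanning, so all sixteen receive the GEN12_PTE_PS64 hint.
 */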
604 
605 static int
606 xe_pt_stage_bind_entry(struct xe_ptw *parent, pgoff_t offset,
607 		       unsigned int level, u64 addr, u64 next,
608 		       struct xe_ptw **child,
609 		       enum page_walk_action *action,
610 		       struct xe_pt_walk *walk)
611 {
612 	struct xe_pt_stage_bind_walk *xe_walk =
613 		container_of(walk, typeof(*xe_walk), base);
614 	struct xe_pt *xe_parent = container_of(parent, typeof(*xe_parent), base);
615 	struct xe_pt *xe_child;
616 	bool covers;
617 	int ret = 0;
618 	u64 pte;
619 
620 	/* Is this a leaf entry? */
621 	if (level == 0 || xe_pt_hugepte_possible(addr, next, level, xe_walk)) {
622 		struct xe_res_cursor *curs = xe_walk->curs;
623 
624 		XE_WARN_ON(xe_walk->va_curs_start != addr);
625 
626 		pte = __gen8_pte_encode(xe_res_dma(curs) + xe_walk->dma_offset,
627 					xe_walk->cache, xe_walk->pte_flags,
628 					level);
629 		pte |= xe_walk->default_pte;
630 
631 		/*
632 		 * Set the GEN12_PTE_PS64 hint if possible, otherwise if
633 		 * this device *requires* 64K PTE size for VRAM, fail.
634 		 */
635 		if (level == 0 && !xe_parent->is_compact) {
636 			if (xe_pt_is_pte_ps64K(addr, next, xe_walk))
637 				pte |= GEN12_PTE_PS64;
638 			else if (XE_WARN_ON(xe_walk->needs_64K))
639 				return -EINVAL;
640 		}
641 
642 		ret = xe_pt_insert_entry(xe_walk, xe_parent, offset, NULL, pte);
643 		if (unlikely(ret))
644 			return ret;
645 
646 		xe_res_next(curs, next - addr);
647 		xe_walk->va_curs_start = next;
648 		*action = ACTION_CONTINUE;
649 
650 		return ret;
651 	}
652 
653 	/*
654 	 * Descending to lower level. Determine if we need to allocate a
655 	 * new page table or -directory, which we do if there is no
656 	 * previous one or there is one we can completely replace.
657 	 */
658 	if (level == 1) {
659 		walk->shifts = xe_normal_pt_shifts;
660 		xe_walk->l0_end_addr = next;
661 	}
662 
663 	covers = xe_pt_covers(addr, next, level, &xe_walk->base);
664 	if (covers || !*child) {
665 		u64 flags = 0;
666 
667 		xe_child = xe_pt_create(xe_walk->vm, xe_walk->gt, level - 1);
668 		if (IS_ERR(xe_child))
669 			return PTR_ERR(xe_child);
670 
671 		xe_pt_set_addr(xe_child,
672 			       round_down(addr, 1ull << walk->shifts[level]));
673 
674 		if (!covers)
675 			xe_pt_populate_empty(xe_walk->gt, xe_walk->vm, xe_child);
676 
677 		*child = &xe_child->base;
678 
679 		/*
680 		 * Prefer the compact pagetable layout for L0 if possible.
681 		 * TODO: Suballocate the pt bo to avoid wasting a lot of
682 		 * memory.
683 		 */
684 		if (GRAPHICS_VERx100(xe_walk->gt->xe) >= 1250 && level == 1 &&
685 		    covers && xe_pt_scan_64K(addr, next, xe_walk)) {
686 			walk->shifts = xe_compact_pt_shifts;
687 			flags |= GEN12_PDE_64K;
688 			xe_child->is_compact = true;
689 		}
690 
691 		pte = gen8_pde_encode(xe_child->bo, 0, xe_walk->cache) | flags;
692 		ret = xe_pt_insert_entry(xe_walk, xe_parent, offset, xe_child,
693 					 pte);
694 	}
695 
696 	*action = ACTION_SUBTREE;
697 	return ret;
698 }
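
/*
 * Note on the two 64 KiB paths above: on graphics version 12.50+ (per the
 * GRAPHICS_VERx100() check), a freshly allocated L0 page-table that fully
 * covers its 2 MiB range and passes xe_pt_scan_64K() switches the walk to
 * xe_compact_pt_shifts, so the L0 table gets one entry per 64 KiB chunk and
 * its parent PDE carries GEN12_PDE_64K. Otherwise the L0 table keeps the
 * normal 4 KiB layout and eligible leaf PTEs only carry the GEN12_PTE_PS64
 * TLB hint.
 */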
699 
700 static const struct xe_pt_walk_ops xe_pt_stage_bind_ops = {
701 	.pt_entry = xe_pt_stage_bind_entry,
702 };
703 
704 /**
705  * xe_pt_stage_bind() - Build a disconnected page-table tree for a given address
706  * range.
707  * @gt: The gt we're building for.
708  * @vma: The vma indicating the address range.
709  * @entries: Storage for the update entries used for connecting the tree to
710  * the main tree at commit time.
711  * @num_entries: On output contains the number of @entries used.
712  *
713  * This function builds a disconnected page-table tree for a given address
714  * range. The tree is connected to the main vm tree for the gpu using
715  * xe_migrate_update_pgtables() and for the cpu using xe_pt_commit_bind().
716  * The function builds xe_vm_pgtable_update structures for already existing
717  * shared page-tables, and non-existing shared and non-shared page-tables
718  * are built and populated directly.
719  *
720  * Return: 0 on success, negative error code on error.
721  */
722 static int
723 xe_pt_stage_bind(struct xe_gt *gt, struct xe_vma *vma,
724 		 struct xe_vm_pgtable_update *entries, u32 *num_entries)
725 {
726 	struct xe_bo *bo = vma->bo;
727 	bool is_vram = !xe_vma_is_userptr(vma) && bo && xe_bo_is_vram(bo);
728 	struct xe_res_cursor curs;
729 	struct xe_pt_stage_bind_walk xe_walk = {
730 		.base = {
731 			.ops = &xe_pt_stage_bind_ops,
732 			.shifts = xe_normal_pt_shifts,
733 			.max_level = XE_PT_HIGHEST_LEVEL,
734 		},
735 		.vm = vma->vm,
736 		.gt = gt,
737 		.curs = &curs,
738 		.va_curs_start = vma->start,
739 		.pte_flags = vma->pte_flags,
740 		.wupd.entries = entries,
741 		.needs_64K = (vma->vm->flags & XE_VM_FLAGS_64K) && is_vram,
742 	};
743 	struct xe_pt *pt = vma->vm->pt_root[gt->info.id];
744 	int ret;
745 
746 	if (is_vram) {
747 		xe_walk.default_pte = GEN12_PPGTT_PTE_LM;
748 		if (vma && vma->use_atomic_access_pte_bit)
749 			xe_walk.default_pte |= GEN12_USM_PPGTT_PTE_AE;
750 		xe_walk.dma_offset = gt->mem.vram.io_start -
751 			gt_to_xe(gt)->mem.vram.io_start;
752 		xe_walk.cache = XE_CACHE_WB;
753 	} else {
754 		if (!xe_vma_is_userptr(vma) && bo->flags & XE_BO_SCANOUT_BIT)
755 			xe_walk.cache = XE_CACHE_WT;
756 		else
757 			xe_walk.cache = XE_CACHE_WB;
758 	}
759 
760 	xe_bo_assert_held(bo);
761 	if (xe_vma_is_userptr(vma))
762 		xe_res_first_sg(vma->userptr.sg, 0, vma->end - vma->start + 1,
763 				&curs);
764 	else if (xe_bo_is_vram(bo))
765 		xe_res_first(bo->ttm.resource, vma->bo_offset,
766 			     vma->end - vma->start + 1, &curs);
767 	else
768 		xe_res_first_sg(xe_bo_get_sg(bo), vma->bo_offset,
769 				vma->end - vma->start + 1, &curs);
770 
771 	ret = xe_pt_walk_range(&pt->base, pt->level, vma->start, vma->end + 1,
772 				&xe_walk.base);
773 
774 	*num_entries = xe_walk.wupd.num_used_entries;
775 	return ret;
776 }
777 
778 /**
779  * xe_pt_nonshared_offsets() - Determine the non-shared entry offsets of a
780  * shared pagetable.
781  * @addr: The start address within the non-shared pagetable.
782  * @end: The end address within the non-shared pagetable.
783  * @level: The level of the non-shared pagetable.
784  * @walk: Walk info. The function adjusts the walk action.
785  * @action: next action to perform (see enum page_walk_action)
786  * @offset: Ignored on input, first non-shared entry on output.
787  * @end_offset: Ignored on input, last non-shared entry + 1 on output.
788  *
789  * A non-shared page-table has some entries that belong to the address range
790  * and others that don't. This function determines the entries that belong
791  * fully to the address range. Depending on level, some entries may
792  * partially belong to the address range (that can't happen at level 0).
793  * The function detects that and adjusts those offsets to not include those
794  * partial entries. Iff it does detect partial entries, we know that there must
795  * be shared page tables also at lower levels, so it adjusts the walk action
796  * accordingly.
797  *
798  * Return: true if there were non-shared entries, false otherwise.
799  */
800 static bool xe_pt_nonshared_offsets(u64 addr, u64 end, unsigned int level,
801 				    struct xe_pt_walk *walk,
802 				    enum page_walk_action *action,
803 				    pgoff_t *offset, pgoff_t *end_offset)
804 {
805 	u64 size = 1ull << walk->shifts[level];
806 
807 	*offset = xe_pt_offset(addr, level, walk);
808 	*end_offset = xe_pt_num_entries(addr, end, level, walk) + *offset;
809 
810 	if (!level)
811 		return true;
812 
813 	/*
814 	 * If addr or next are not size aligned, there are shared pts at lower
815 	 * level, so in that case traverse down the subtree
816 	 */
817 	*action = ACTION_CONTINUE;
818 	if (!IS_ALIGNED(addr, size)) {
819 		*action = ACTION_SUBTREE;
820 		(*offset)++;
821 	}
822 
823 	if (!IS_ALIGNED(end, size)) {
824 		*action = ACTION_SUBTREE;
825 		(*end_offset)--;
826 	}
827 
828 	return *end_offset > *offset;
829 }
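
/*
 * A worked example (not part of the driver source), assuming the normal
 * shifts so a level-1 entry spans 2 MiB: for addr == 3 MiB and end == 9 MiB,
 * only the level-1 entries fully inside [4 MiB, 8 MiB) are reported in
 * [*offset, *end_offset); the two partially covered boundary entries are
 * excluded and *action becomes ACTION_SUBTREE so the walk descends into
 * their level-0 tables instead.
 */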
830 
831 struct xe_pt_zap_ptes_walk {
832 	/** @base: The walk base-class */
833 	struct xe_pt_walk base;
834 
835 	/* Input parameters for the walk */
836 	/** @gt: The gt we're building for */
837 	struct xe_gt *gt;
838 
839 	/* Output */
840 	/** @needs_invalidate: Whether we need to invalidate TLB */
841 	bool needs_invalidate;
842 };
843 
844 static int xe_pt_zap_ptes_entry(struct xe_ptw *parent, pgoff_t offset,
845 				unsigned int level, u64 addr, u64 next,
846 				struct xe_ptw **child,
847 				enum page_walk_action *action,
848 				struct xe_pt_walk *walk)
849 {
850 	struct xe_pt_zap_ptes_walk *xe_walk =
851 		container_of(walk, typeof(*xe_walk), base);
852 	struct xe_pt *xe_child = container_of(*child, typeof(*xe_child), base);
853 	pgoff_t end_offset;
854 
855 	XE_BUG_ON(!*child);
856 	XE_BUG_ON(!level && xe_child->is_compact);
857 
858 	/*
859 	 * Note that we're called from an entry callback, and we're dealing
860 	 * with the child of that entry rather than the parent, so need to
861 	 * adjust level down.
862 	 */
863 	if (xe_pt_nonshared_offsets(addr, next, --level, walk, action, &offset,
864 				    &end_offset)) {
865 		xe_map_memset(gt_to_xe(xe_walk->gt), &xe_child->bo->vmap,
866 			      offset * sizeof(u64), 0,
867 			      (end_offset - offset) * sizeof(u64));
868 		xe_walk->needs_invalidate = true;
869 	}
870 
871 	return 0;
872 }
873 
874 static const struct xe_pt_walk_ops xe_pt_zap_ptes_ops = {
875 	.pt_entry = xe_pt_zap_ptes_entry,
876 };
877 
878 /**
879  * xe_pt_zap_ptes() - Zap (zero) gpu ptes of an address range
880  * @gt: The gt we're zapping for.
881  * @vma: GPU VMA detailing address range.
882  *
883  * Eviction and Userptr invalidation need to be able to zap the
884  * gpu ptes of a given address range in pagefaulting mode.
885  * In order to be able to do that, this function needs access to the shared
886  * page-table entries so it can either clear the leaf PTEs or
887  * clear the pointers to lower-level page-tables. The caller is required
888  * to hold the necessary locks to ensure neither the page-table connectivity
889  * nor the page-table entries of the range are updated from under us.
890  *
891  * Return: Whether ptes were actually updated and a TLB invalidation is
892  * required.
893  */
894 bool xe_pt_zap_ptes(struct xe_gt *gt, struct xe_vma *vma)
895 {
896 	struct xe_pt_zap_ptes_walk xe_walk = {
897 		.base = {
898 			.ops = &xe_pt_zap_ptes_ops,
899 			.shifts = xe_normal_pt_shifts,
900 			.max_level = XE_PT_HIGHEST_LEVEL,
901 		},
902 		.gt = gt,
903 	};
904 	struct xe_pt *pt = vma->vm->pt_root[gt->info.id];
905 
906 	if (!(vma->gt_present & BIT(gt->info.id)))
907 		return false;
908 
909 	(void)xe_pt_walk_shared(&pt->base, pt->level, vma->start, vma->end + 1,
910 				 &xe_walk.base);
911 
912 	return xe_walk.needs_invalidate;
913 }
914 
915 static void
916 xe_vm_populate_pgtable(struct xe_migrate_pt_update *pt_update, struct xe_gt *gt,
917 		       struct iosys_map *map, void *data,
918 		       u32 qword_ofs, u32 num_qwords,
919 		       const struct xe_vm_pgtable_update *update)
920 {
921 	struct xe_pt_entry *ptes = update->pt_entries;
922 	u64 *ptr = data;
923 	u32 i;
924 
925 	XE_BUG_ON(xe_gt_is_media_type(gt));
926 
927 	for (i = 0; i < num_qwords; i++) {
928 		if (map)
929 			xe_map_wr(gt_to_xe(gt), map, (qword_ofs + i) *
930 				  sizeof(u64), u64, ptes[i].pte);
931 		else
932 			ptr[i] = ptes[i].pte;
933 	}
934 }
935 
936 static void xe_pt_abort_bind(struct xe_vma *vma,
937 			     struct xe_vm_pgtable_update *entries,
938 			     u32 num_entries)
939 {
940 	u32 i, j;
941 
942 	for (i = 0; i < num_entries; i++) {
943 		if (!entries[i].pt_entries)
944 			continue;
945 
946 		for (j = 0; j < entries[i].qwords; j++)
947 			xe_pt_destroy(entries[i].pt_entries[j].pt, vma->vm->flags, NULL);
948 		kfree(entries[i].pt_entries);
949 	}
950 }
951 
952 static void xe_pt_commit_locks_assert(struct xe_vma *vma)
953 {
954 	struct xe_vm *vm = vma->vm;
955 
956 	lockdep_assert_held(&vm->lock);
957 
958 	if (xe_vma_is_userptr(vma))
959 		lockdep_assert_held_read(&vm->userptr.notifier_lock);
960 	else
961 		dma_resv_assert_held(vma->bo->ttm.base.resv);
962 
963 	dma_resv_assert_held(&vm->resv);
964 }
965 
966 static void xe_pt_commit_bind(struct xe_vma *vma,
967 			      struct xe_vm_pgtable_update *entries,
968 			      u32 num_entries, bool rebind,
969 			      struct llist_head *deferred)
970 {
971 	u32 i, j;
972 
973 	xe_pt_commit_locks_assert(vma);
974 
975 	for (i = 0; i < num_entries; i++) {
976 		struct xe_pt *pt = entries[i].pt;
977 		struct xe_pt_dir *pt_dir;
978 
979 		if (!rebind)
980 			pt->num_live += entries[i].qwords;
981 
982 		if (!pt->level) {
983 			kfree(entries[i].pt_entries);
984 			continue;
985 		}
986 
987 		pt_dir = as_xe_pt_dir(pt);
988 		for (j = 0; j < entries[i].qwords; j++) {
989 			u32 j_ = j + entries[i].ofs;
990 			struct xe_pt *newpte = entries[i].pt_entries[j].pt;
991 
992 			if (xe_pt_entry(pt_dir, j_))
993 				xe_pt_destroy(xe_pt_entry(pt_dir, j_),
994 					      vma->vm->flags, deferred);
995 
996 			pt_dir->dir.entries[j_] = &newpte->base;
997 		}
998 		kfree(entries[i].pt_entries);
999 	}
1000 }
1001 
1002 static int
1003 xe_pt_prepare_bind(struct xe_gt *gt, struct xe_vma *vma,
1004 		   struct xe_vm_pgtable_update *entries, u32 *num_entries,
1005 		   bool rebind)
1006 {
1007 	int err;
1008 
1009 	*num_entries = 0;
1010 	err = xe_pt_stage_bind(gt, vma, entries, num_entries);
1011 	if (!err)
1012 		BUG_ON(!*num_entries);
1013 	else /* abort! */
1014 		xe_pt_abort_bind(vma, entries, *num_entries);
1015 
1016 	return err;
1017 }
1018 
1019 static void xe_vm_dbg_print_entries(struct xe_device *xe,
1020 				    const struct xe_vm_pgtable_update *entries,
1021 				    unsigned int num_entries)
1022 #if (IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM))
1023 {
1024 	unsigned int i;
1025 
1026 	vm_dbg(&xe->drm, "%u entries to update\n", num_entries);
1027 	for (i = 0; i < num_entries; i++) {
1028 		const struct xe_vm_pgtable_update *entry = &entries[i];
1029 		struct xe_pt *xe_pt = entry->pt;
1030 		u64 page_size = 1ull << xe_pt_shift(xe_pt->level);
1031 		u64 end;
1032 		u64 start;
1033 
1034 		XE_BUG_ON(entry->pt->is_compact);
1035 		start = entry->ofs * page_size;
1036 		end = start + page_size * entry->qwords;
1037 		vm_dbg(&xe->drm,
1038 		       "\t%u: Update level %u at (%u + %u) [%llx...%llx) f:%x\n",
1039 		       i, xe_pt->level, entry->ofs, entry->qwords,
1040 		       xe_pt_addr(xe_pt) + start, xe_pt_addr(xe_pt) + end, 0);
1041 	}
1042 }
1043 #else
1044 {}
1045 #endif
1046 
1047 #ifdef CONFIG_DRM_XE_USERPTR_INVAL_INJECT
1048 
1049 static bool xe_pt_userptr_inject_eagain(struct xe_vma *vma)
1050 {
1051 	u32 divisor = vma->userptr.divisor ? vma->userptr.divisor : 2;
1052 	static u32 count;
1053 
1054 	if (count++ % divisor == divisor - 1) {
1055 		struct xe_vm *vm = vma->vm;
1056 
1057 		vma->userptr.divisor = divisor << 1;
1058 		spin_lock(&vm->userptr.invalidated_lock);
1059 		list_move_tail(&vma->userptr.invalidate_link,
1060 			       &vm->userptr.invalidated);
1061 		spin_unlock(&vm->userptr.invalidated_lock);
1062 		return true;
1063 	}
1064 
1065 	return false;
1066 }
1067 
1068 #else
1069 
1070 static bool xe_pt_userptr_inject_eagain(struct xe_vma *vma)
1071 {
1072 	return false;
1073 }
1074 
1075 #endif
1076 
1077 /**
1078  * struct xe_pt_migrate_pt_update - Callback argument for pre-commit callbacks
1079  * @base: Base we derive from.
1080  * @bind: Whether this is a bind or an unbind operation. A bind operation
1081  *        makes the pre-commit callback error with -EAGAIN if it detects a
1082  *        pending invalidation.
1083  * @locked: Whether the pre-commit callback locked the userptr notifier lock
1084  *          and it needs unlocking.
1085  */
1086 struct xe_pt_migrate_pt_update {
1087 	struct xe_migrate_pt_update base;
1088 	bool bind;
1089 	bool locked;
1090 };
1091 
1092 static int xe_pt_userptr_pre_commit(struct xe_migrate_pt_update *pt_update)
1093 {
1094 	struct xe_pt_migrate_pt_update *userptr_update =
1095 		container_of(pt_update, typeof(*userptr_update), base);
1096 	struct xe_vma *vma = pt_update->vma;
1097 	unsigned long notifier_seq = vma->userptr.notifier_seq;
1098 	struct xe_vm *vm = vma->vm;
1099 
1100 	userptr_update->locked = false;
1101 
1102 	/*
1103 	 * Wait until nobody is running the invalidation notifier, and
1104 	 * since we're exiting the loop holding the notifier lock,
1105 	 * nobody can proceed invalidating either.
1106 	 *
1107 	 * Note that we don't update the vma->userptr.notifier_seq since
1108 	 * we don't update the userptr pages.
1109 	 */
1110 	do {
1111 		down_read(&vm->userptr.notifier_lock);
1112 		if (!mmu_interval_read_retry(&vma->userptr.notifier,
1113 					     notifier_seq))
1114 			break;
1115 
1116 		up_read(&vm->userptr.notifier_lock);
1117 
1118 		if (userptr_update->bind)
1119 			return -EAGAIN;
1120 
1121 		notifier_seq = mmu_interval_read_begin(&vma->userptr.notifier);
1122 	} while (true);
1123 
1124 	/* Inject errors to test whether they are handled correctly */
1125 	if (userptr_update->bind && xe_pt_userptr_inject_eagain(vma)) {
1126 		up_read(&vm->userptr.notifier_lock);
1127 		return -EAGAIN;
1128 	}
1129 
1130 	userptr_update->locked = true;
1131 
1132 	return 0;
1133 }
1134 
1135 static const struct xe_migrate_pt_update_ops bind_ops = {
1136 	.populate = xe_vm_populate_pgtable,
1137 };
1138 
1139 static const struct xe_migrate_pt_update_ops userptr_bind_ops = {
1140 	.populate = xe_vm_populate_pgtable,
1141 	.pre_commit = xe_pt_userptr_pre_commit,
1142 };
1143 
1144 /**
1145  * __xe_pt_bind_vma() - Build and connect a page-table tree for the vma
1146  * address range.
1147  * @gt: The gt to bind for.
1148  * @vma: The vma to bind.
1149  * @e: The engine with which to do pipelined page-table updates.
1150  * @syncs: Entries to sync on before binding the built tree to the live vm tree.
1151  * @num_syncs: Number of @sync entries.
1152  * @rebind: Whether we're rebinding this vma to the same address range without
1153  * an unbind in-between.
1154  *
1155  * This function builds a page-table tree (see xe_pt_stage_bind() for more
1156  * information on page-table building), and the xe_vm_pgtable_update entries
1157  * abstracting the operations needed to attach it to the main vm tree. It
1158  * then takes the relevant locks and updates the metadata side of the main
1159  * vm tree and submits the operations for pipelined attachment of the
1160  * gpu page-table to the vm main tree (which can be done either by the
1161  * cpu or the GPU).
1162  *
1163  * Return: A valid dma-fence representing the pipelined attachment operation
1164  * on success, an error pointer on error.
1165  */
1166 struct dma_fence *
1167 __xe_pt_bind_vma(struct xe_gt *gt, struct xe_vma *vma, struct xe_engine *e,
1168 		 struct xe_sync_entry *syncs, u32 num_syncs,
1169 		 bool rebind)
1170 {
1171 	struct xe_vm_pgtable_update entries[XE_VM_MAX_LEVEL * 2 + 1];
1172 	struct xe_pt_migrate_pt_update bind_pt_update = {
1173 		.base = {
1174 			.ops = xe_vma_is_userptr(vma) ? &userptr_bind_ops : &bind_ops,
1175 			.vma = vma,
1176 		},
1177 		.bind = true,
1178 	};
1179 	struct xe_vm *vm = vma->vm;
1180 	u32 num_entries;
1181 	struct dma_fence *fence;
1182 	int err;
1183 
1184 	bind_pt_update.locked = false;
1185 	xe_bo_assert_held(vma->bo);
1186 	xe_vm_assert_held(vm);
1187 	XE_BUG_ON(xe_gt_is_media_type(gt));
1188 
1189 	vm_dbg(&vma->vm->xe->drm,
1190 	       "Preparing bind, with range [%llx...%llx) engine %p.\n",
1191 	       vma->start, vma->end, e);
1192 
1193 	err = xe_pt_prepare_bind(gt, vma, entries, &num_entries, rebind);
1194 	if (err)
1195 		goto err;
1196 	XE_BUG_ON(num_entries > ARRAY_SIZE(entries));
1197 
1198 	xe_vm_dbg_print_entries(gt_to_xe(gt), entries, num_entries);
1199 
1200 	fence = xe_migrate_update_pgtables(gt->migrate,
1201 					   vm, vma->bo,
1202 					   e ? e : vm->eng[gt->info.id],
1203 					   entries, num_entries,
1204 					   syncs, num_syncs,
1205 					   &bind_pt_update.base);
1206 	if (!IS_ERR(fence)) {
1207 		LLIST_HEAD(deferred);
1208 
1209 		/* add shared fence now for pagetable delayed destroy */
1210 		dma_resv_add_fence(&vm->resv, fence, !rebind &&
1211 				   vma->last_munmap_rebind ?
1212 				   DMA_RESV_USAGE_KERNEL :
1213 				   DMA_RESV_USAGE_BOOKKEEP);
1214 
1215 		if (!xe_vma_is_userptr(vma) && !vma->bo->vm)
1216 			dma_resv_add_fence(vma->bo->ttm.base.resv, fence,
1217 					   DMA_RESV_USAGE_BOOKKEEP);
1218 		xe_pt_commit_bind(vma, entries, num_entries, rebind,
1219 				  bind_pt_update.locked ? &deferred : NULL);
1220 
1221 		/* This vma is live (again?) now */
1222 		vma->gt_present |= BIT(gt->info.id);
1223 
1224 		if (bind_pt_update.locked) {
1225 			vma->userptr.initial_bind = true;
1226 			up_read(&vm->userptr.notifier_lock);
1227 			xe_bo_put_commit(&deferred);
1228 		}
1229 		if (!rebind && vma->last_munmap_rebind &&
1230 		    xe_vm_in_compute_mode(vm))
1231 			queue_work(vm->xe->ordered_wq,
1232 				   &vm->preempt.rebind_work);
1233 	} else {
1234 		if (bind_pt_update.locked)
1235 			up_read(&vm->userptr.notifier_lock);
1236 		xe_pt_abort_bind(vma, entries, num_entries);
1237 	}
1238 
1239 	return fence;
1240 
1241 err:
1242 	return ERR_PTR(err);
1243 }
1244 
1245 struct xe_pt_stage_unbind_walk {
1246 	/** @base: The pagewalk base-class. */
1247 	struct xe_pt_walk base;
1248 
1249 	/* Input parameters for the walk */
1250 	/** @gt: The gt we're unbinding from. */
1251 	struct xe_gt *gt;
1252 
1253 	/**
1254 	 * @modified_start: Walk range start, modified to include any
1255 	 * shared pagetables that we're the only user of and can thus
1256 	 * treat as private.
1257 	 */
1258 	u64 modified_start;
1259 	/** @modified_end: Walk range end, modified like @modified_start. */
1260 	u64 modified_end;
1261 
1262 	/* Output */
1263 	/** @wupd: Structure to track the page-table updates we're building */
1264 	struct xe_walk_update wupd;
1265 };
1266 
1267 /*
1268  * Check whether this range is the only one populating this pagetable,
1269  * and in that case, update the walk range checks so that higher levels don't
1270  * view us as a shared pagetable.
1271  */
1272 static bool xe_pt_check_kill(u64 addr, u64 next, unsigned int level,
1273 			     const struct xe_pt *child,
1274 			     enum page_walk_action *action,
1275 			     struct xe_pt_walk *walk)
1276 {
1277 	struct xe_pt_stage_unbind_walk *xe_walk =
1278 		container_of(walk, typeof(*xe_walk), base);
1279 	unsigned int shift = walk->shifts[level];
1280 	u64 size = 1ull << shift;
1281 
1282 	if (IS_ALIGNED(addr, size) && IS_ALIGNED(next, size) &&
1283 	    ((next - addr) >> shift) == child->num_live) {
1284 		u64 size = 1ull << walk->shifts[level + 1];
1285 
1286 		*action = ACTION_CONTINUE;
1287 
1288 		if (xe_walk->modified_start >= addr)
1289 			xe_walk->modified_start = round_down(addr, size);
1290 		if (xe_walk->modified_end <= next)
1291 			xe_walk->modified_end = round_up(next, size);
1292 
1293 		return true;
1294 	}
1295 
1296 	return false;
1297 }
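
/*
 * Note: when the check above succeeds, every live entry of @child lies
 * inside the range being unbound, so the whole child page-table can be
 * reclaimed. Widening modified_start/modified_end to the parent entry size
 * makes the parent's own entry appear fully covered as well, allowing
 * xe_pt_stage_unbind_post_descend() to treat it as non-shared and clear it.
 */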
1298 
1299 static int xe_pt_stage_unbind_entry(struct xe_ptw *parent, pgoff_t offset,
1300 				    unsigned int level, u64 addr, u64 next,
1301 				    struct xe_ptw **child,
1302 				    enum page_walk_action *action,
1303 				    struct xe_pt_walk *walk)
1304 {
1305 	struct xe_pt *xe_child = container_of(*child, typeof(*xe_child), base);
1306 
1307 	XE_BUG_ON(!*child);
1308 	XE_BUG_ON(!level && xe_child->is_compact);
1309 
1310 	xe_pt_check_kill(addr, next, level - 1, xe_child, action, walk);
1311 
1312 	return 0;
1313 }
1314 
1315 static int
1316 xe_pt_stage_unbind_post_descend(struct xe_ptw *parent, pgoff_t offset,
1317 				unsigned int level, u64 addr, u64 next,
1318 				struct xe_ptw **child,
1319 				enum page_walk_action *action,
1320 				struct xe_pt_walk *walk)
1321 {
1322 	struct xe_pt_stage_unbind_walk *xe_walk =
1323 		container_of(walk, typeof(*xe_walk), base);
1324 	struct xe_pt *xe_child = container_of(*child, typeof(*xe_child), base);
1325 	pgoff_t end_offset;
1326 	u64 size = 1ull << walk->shifts[--level];
1327 
1328 	if (!IS_ALIGNED(addr, size))
1329 		addr = xe_walk->modified_start;
1330 	if (!IS_ALIGNED(next, size))
1331 		next = xe_walk->modified_end;
1332 
1333 	/* Parent == *child is the root pt. Don't kill it. */
1334 	if (parent != *child &&
1335 	    xe_pt_check_kill(addr, next, level, xe_child, action, walk))
1336 		return 0;
1337 
1338 	if (!xe_pt_nonshared_offsets(addr, next, level, walk, action, &offset,
1339 				     &end_offset))
1340 		return 0;
1341 
1342 	(void)xe_pt_new_shared(&xe_walk->wupd, xe_child, offset, false);
1343 	xe_walk->wupd.updates[level].update->qwords = end_offset - offset;
1344 
1345 	return 0;
1346 }
1347 
1348 static const struct xe_pt_walk_ops xe_pt_stage_unbind_ops = {
1349 	.pt_entry = xe_pt_stage_unbind_entry,
1350 	.pt_post_descend = xe_pt_stage_unbind_post_descend,
1351 };
1352 
1353 /**
1354  * xe_pt_stage_unbind() - Build page-table update structures for an unbind
1355  * operation
1356  * @gt: The gt we're unbinding for.
1357  * @vma: The vma we're unbinding.
1358  * @entries: Caller-provided storage for the update structures.
1359  *
1360  * Builds page-table update structures for an unbind operation. The function
1361  * will attempt to remove all page-tables that we're the only user
1362  * of, and for that to work, the unbind operation must be committed in the
1363  * same critical section that blocks racing binds to the same page-table tree.
1364  *
1365  * Return: The number of entries used.
1366  */
1367 static unsigned int xe_pt_stage_unbind(struct xe_gt *gt, struct xe_vma *vma,
1368 				       struct xe_vm_pgtable_update *entries)
1369 {
1370 	struct xe_pt_stage_unbind_walk xe_walk = {
1371 		.base = {
1372 			.ops = &xe_pt_stage_unbind_ops,
1373 			.shifts = xe_normal_pt_shifts,
1374 			.max_level = XE_PT_HIGHEST_LEVEL,
1375 		},
1376 		.gt = gt,
1377 		.modified_start = vma->start,
1378 		.modified_end = vma->end + 1,
1379 		.wupd.entries = entries,
1380 	};
1381 	struct xe_pt *pt = vma->vm->pt_root[gt->info.id];
1382 
1383 	(void)xe_pt_walk_shared(&pt->base, pt->level, vma->start, vma->end + 1,
1384 				 &xe_walk.base);
1385 
1386 	return xe_walk.wupd.num_used_entries;
1387 }
1388 
1389 static void
1390 xe_migrate_clear_pgtable_callback(struct xe_migrate_pt_update *pt_update,
1391 				  struct xe_gt *gt, struct iosys_map *map,
1392 				  void *ptr, u32 qword_ofs, u32 num_qwords,
1393 				  const struct xe_vm_pgtable_update *update)
1394 {
1395 	struct xe_vma *vma = pt_update->vma;
1396 	u64 empty = __xe_pt_empty_pte(gt, vma->vm, update->pt->level);
1397 	int i;
1398 
1399 	XE_BUG_ON(xe_gt_is_media_type(gt));
1400 
1401 	if (map && map->is_iomem)
1402 		for (i = 0; i < num_qwords; ++i)
1403 			xe_map_wr(gt_to_xe(gt), map, (qword_ofs + i) *
1404 				  sizeof(u64), u64, empty);
1405 	else if (map)
1406 		memset64(map->vaddr + qword_ofs * sizeof(u64), empty,
1407 			 num_qwords);
1408 	else
1409 		memset64(ptr, empty, num_qwords);
1410 }
1411 
1412 static void
1413 xe_pt_commit_unbind(struct xe_vma *vma,
1414 		    struct xe_vm_pgtable_update *entries, u32 num_entries,
1415 		    struct llist_head *deferred)
1416 {
1417 	u32 j;
1418 
1419 	xe_pt_commit_locks_assert(vma);
1420 
1421 	for (j = 0; j < num_entries; ++j) {
1422 		struct xe_vm_pgtable_update *entry = &entries[j];
1423 		struct xe_pt *pt = entry->pt;
1424 
1425 		pt->num_live -= entry->qwords;
1426 		if (pt->level) {
1427 			struct xe_pt_dir *pt_dir = as_xe_pt_dir(pt);
1428 			u32 i;
1429 
1430 			for (i = entry->ofs; i < entry->ofs + entry->qwords;
1431 			     i++) {
1432 				if (xe_pt_entry(pt_dir, i))
1433 					xe_pt_destroy(xe_pt_entry(pt_dir, i),
1434 						      vma->vm->flags, deferred);
1435 
1436 				pt_dir->dir.entries[i] = NULL;
1437 			}
1438 		}
1439 	}
1440 }
1441 
1442 static const struct xe_migrate_pt_update_ops unbind_ops = {
1443 	.populate = xe_migrate_clear_pgtable_callback,
1444 };
1445 
1446 static const struct xe_migrate_pt_update_ops userptr_unbind_ops = {
1447 	.populate = xe_migrate_clear_pgtable_callback,
1448 	.pre_commit = xe_pt_userptr_pre_commit,
1449 };
1450 
1451 /**
1452  * __xe_pt_unbind_vma() - Disconnect and free a page-table tree for the vma
1453  * address range.
1454  * @gt: The gt to unbind for.
1455  * @vma: The vma to unbind.
1456  * @e: The engine with which to do pipelined page-table updates.
1457  * @syncs: Entries to sync on before disconnecting the tree to be destroyed.
1458  * @num_syncs: Number of @sync entries.
1459  *
1460  * This function builds the xe_vm_pgtable_update entries abstracting the
1461  * operations needed to detach the page-table tree to be destroyed from the
1462  * main vm tree.
1463  * It then takes the relevant locks and submits the operations for
1464  * pipelined detachment of the gpu page-table from the vm main tree
1465  * (which can be done either by the cpu or the GPU). Finally it frees the
1466  * detached page-table tree.
1467  *
1468  * Return: A valid dma-fence representing the pipelined detachment operation
1469  * on success, an error pointer on error.
1470  */
1471 struct dma_fence *
1472 __xe_pt_unbind_vma(struct xe_gt *gt, struct xe_vma *vma, struct xe_engine *e,
1473 		   struct xe_sync_entry *syncs, u32 num_syncs)
1474 {
1475 	struct xe_vm_pgtable_update entries[XE_VM_MAX_LEVEL * 2 + 1];
1476 	struct xe_pt_migrate_pt_update unbind_pt_update = {
1477 		.base = {
1478 			.ops = xe_vma_is_userptr(vma) ? &userptr_unbind_ops :
1479 			&unbind_ops,
1480 			.vma = vma,
1481 		},
1482 	};
1483 	struct xe_vm *vm = vma->vm;
1484 	u32 num_entries;
1485 	struct dma_fence *fence = NULL;
1486 	LLIST_HEAD(deferred);
1487 
1488 	xe_bo_assert_held(vma->bo);
1489 	xe_vm_assert_held(vm);
1490 	XE_BUG_ON(xe_gt_is_media_type(gt));
1491 
1492 	vm_dbg(&vma->vm->xe->drm,
1493 	       "Preparing unbind, with range [%llx...%llx) engine %p.\n",
1494 	       vma->start, vma->end, e);
1495 
1496 	num_entries = xe_pt_stage_unbind(gt, vma, entries);
1497 	XE_BUG_ON(num_entries > ARRAY_SIZE(entries));
1498 
1499 	xe_vm_dbg_print_entries(gt_to_xe(gt), entries, num_entries);
1500 
1501 	/*
1502 	 * Even if we were already evicted and unbind to destroy, we need to
1503 	 * clear again here. The eviction may have updated pagetables at a
1504 	 * lower level, because it needs to be more conservative.
1505 	 */
1506 	fence = xe_migrate_update_pgtables(gt->migrate,
1507 					   vm, NULL, e ? e :
1508 					   vm->eng[gt->info.id],
1509 					   entries, num_entries,
1510 					   syncs, num_syncs,
1511 					   &unbind_pt_update.base);
1512 	if (!IS_ERR(fence)) {
1513 		/* add shared fence now for pagetable delayed destroy */
1514 		dma_resv_add_fence(&vm->resv, fence,
1515 				   DMA_RESV_USAGE_BOOKKEEP);
1516 
1517 		/* This fence will be installed by caller when doing eviction */
1518 		if (!xe_vma_is_userptr(vma) && !vma->bo->vm)
1519 			dma_resv_add_fence(vma->bo->ttm.base.resv, fence,
1520 					   DMA_RESV_USAGE_BOOKKEEP);
1521 		xe_pt_commit_unbind(vma, entries, num_entries,
1522 				    unbind_pt_update.locked ? &deferred : NULL);
1523 		vma->gt_present &= ~BIT(gt->info.id);
1524 	}
1525 
1526 	if (!vma->gt_present)
1527 		list_del_init(&vma->rebind_link);
1528 
1529 	if (unbind_pt_update.locked) {
1530 		XE_WARN_ON(!xe_vma_is_userptr(vma));
1531 
1532 		if (!vma->gt_present) {
1533 			spin_lock(&vm->userptr.invalidated_lock);
1534 			list_del_init(&vma->userptr.invalidate_link);
1535 			spin_unlock(&vm->userptr.invalidated_lock);
1536 		}
1537 		up_read(&vm->userptr.notifier_lock);
1538 		xe_bo_put_commit(&deferred);
1539 	}
1540 
1541 	return fence;
1542 }
1543