xref: /linux/drivers/gpu/drm/xe/xe_pt.c (revision 9112fc0109fc0037ac3b8b633a169e78b4e23ca1)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2022 Intel Corporation
4  */
5 
6 #include "xe_pt.h"
7 
8 #include "xe_bo.h"
9 #include "xe_device.h"
10 #include "xe_drm_client.h"
11 #include "xe_gt.h"
12 #include "xe_gt_tlb_invalidation.h"
13 #include "xe_migrate.h"
14 #include "xe_pt_types.h"
15 #include "xe_pt_walk.h"
16 #include "xe_res_cursor.h"
17 #include "xe_trace.h"
18 #include "xe_ttm_stolen_mgr.h"
19 #include "xe_vm.h"
20 
21 struct xe_pt_dir {
22 	struct xe_pt pt;
23 	/** @children: Array of page-table child nodes */
24 	struct xe_ptw *children[XE_PDES];
25 };
26 
27 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
28 #define xe_pt_set_addr(__xe_pt, __addr) ((__xe_pt)->addr = (__addr))
29 #define xe_pt_addr(__xe_pt) ((__xe_pt)->addr)
30 #else
31 #define xe_pt_set_addr(__xe_pt, __addr)
32 #define xe_pt_addr(__xe_pt) 0ull
33 #endif
34 
35 static const u64 xe_normal_pt_shifts[] = {12, 21, 30, 39, 48};
36 static const u64 xe_compact_pt_shifts[] = {16, 21, 30, 39, 48};
37 
38 #define XE_PT_HIGHEST_LEVEL (ARRAY_SIZE(xe_normal_pt_shifts) - 1)
39 
40 static struct xe_pt_dir *as_xe_pt_dir(struct xe_pt *pt)
41 {
42 	return container_of(pt, struct xe_pt_dir, pt);
43 }
44 
45 static struct xe_pt *xe_pt_entry(struct xe_pt_dir *pt_dir, unsigned int index)
46 {
47 	return container_of(pt_dir->children[index], struct xe_pt, base);
48 }
49 
50 static u64 __xe_pt_empty_pte(struct xe_tile *tile, struct xe_vm *vm,
51 			     unsigned int level)
52 {
53 	struct xe_device *xe = tile_to_xe(tile);
54 	u16 pat_index = xe->pat.idx[XE_CACHE_WB];
55 	u8 id = tile->id;
56 
57 	if (!xe_vm_has_scratch(vm))
58 		return 0;
59 
60 	if (level > MAX_HUGEPTE_LEVEL)
61 		return vm->pt_ops->pde_encode_bo(vm->scratch_pt[id][level - 1]->bo,
62 						 0, pat_index);
63 
64 	return vm->pt_ops->pte_encode_addr(xe, 0, pat_index, level, IS_DGFX(xe), 0) |
65 		XE_PTE_NULL;
66 }
67 
68 static void xe_pt_free(struct xe_pt *pt)
69 {
70 	if (pt->level)
71 		kfree(as_xe_pt_dir(pt));
72 	else
73 		kfree(pt);
74 }
75 
76 /**
77  * xe_pt_create() - Create a page-table.
78  * @vm: The vm to create for.
79  * @tile: The tile to create for.
80  * @level: The page-table level.
81  *
82  * Allocate and initialize a single struct xe_pt metadata structure. Also
83  * create the corresponding page-table bo, but don't initialize it. If the
84  * level is grater than zero, then it's assumed to be a directory page-
85  * table and the directory structure is also allocated and initialized to
86  * NULL pointers.
87  *
88  * Return: A valid struct xe_pt pointer on success, or an ERR_PTR() on
89  * error.
90  */
91 struct xe_pt *xe_pt_create(struct xe_vm *vm, struct xe_tile *tile,
92 			   unsigned int level)
93 {
94 	struct xe_pt *pt;
95 	struct xe_bo *bo;
96 	int err;
97 
98 	if (level) {
99 		struct xe_pt_dir *dir = kzalloc(sizeof(*dir), GFP_KERNEL);
100 
101 		pt = (dir) ? &dir->pt : NULL;
102 	} else {
103 		pt = kzalloc(sizeof(*pt), GFP_KERNEL);
104 	}
105 	if (!pt)
106 		return ERR_PTR(-ENOMEM);
107 
108 	pt->level = level;
109 	bo = xe_bo_create_pin_map(vm->xe, tile, vm, SZ_4K,
110 				  ttm_bo_type_kernel,
111 				  XE_BO_CREATE_VRAM_IF_DGFX(tile) |
112 				  XE_BO_CREATE_IGNORE_MIN_PAGE_SIZE_BIT |
113 				  XE_BO_CREATE_PINNED_BIT |
114 				  XE_BO_CREATE_NO_RESV_EVICT |
115 				  XE_BO_PAGETABLE);
116 	if (IS_ERR(bo)) {
117 		err = PTR_ERR(bo);
118 		goto err_kfree;
119 	}
120 	pt->bo = bo;
121 	pt->base.children = level ? as_xe_pt_dir(pt)->children : NULL;
122 
123 	if (vm->xef)
124 		xe_drm_client_add_bo(vm->xef->client, pt->bo);
125 	xe_tile_assert(tile, level <= XE_VM_MAX_LEVEL);
126 
127 	return pt;
128 
129 err_kfree:
130 	xe_pt_free(pt);
131 	return ERR_PTR(err);
132 }
133 
134 /**
135  * xe_pt_populate_empty() - Populate a page-table bo with scratch- or zero
136  * entries.
137  * @tile: The tile whose scratch page-table to use.
138  * @vm: The vm we populate for.
139  * @pt: The page-table whose bo to initialize.
140  *
141  * Populate the page-table bo of @pt with entries pointing into the tile's
142  * scratch page-table tree if any. Otherwise populate with zeros.
143  */
144 void xe_pt_populate_empty(struct xe_tile *tile, struct xe_vm *vm,
145 			  struct xe_pt *pt)
146 {
147 	struct iosys_map *map = &pt->bo->vmap;
148 	u64 empty;
149 	int i;
150 
151 	if (!xe_vm_has_scratch(vm)) {
152 		/*
153 		 * FIXME: Some memory is already allocated to zero?
154 		 * Find out which memory that is and avoid this memset...
155 		 */
156 		xe_map_memset(vm->xe, map, 0, 0, SZ_4K);
157 	} else {
158 		empty = __xe_pt_empty_pte(tile, vm, pt->level);
159 		for (i = 0; i < XE_PDES; i++)
160 			xe_pt_write(vm->xe, map, i, empty);
161 	}
162 }
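
/*
 * Example (illustrative only, with abbreviated error handling and locking):
 * a caller, such as the scratch page-table setup in xe_vm.c, is expected to
 * use the helpers above roughly like this:
 *
 *	struct xe_pt *pt = xe_pt_create(vm, tile, level);
 *
 *	if (IS_ERR(pt))
 *		return PTR_ERR(pt);
 *
 *	xe_pt_populate_empty(tile, vm, pt);
 *	...
 *	xe_pt_destroy(pt, vm->flags, NULL);
 */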
163 
164 /**
165  * xe_pt_shift() - Return the ilog2 value of the size of the address range of
166  * a page-table at a certain level.
167  * @level: The level.
168  *
169  * Return: The ilog2 value of the size of the address range of a page-table
170  * at level @level.
171  */
172 unsigned int xe_pt_shift(unsigned int level)
173 {
174 	return XE_PTE_SHIFT + XE_PDE_SHIFT * level;
175 }
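
/*
 * Example (illustrative): assuming XE_PTE_SHIFT == 12 and XE_PDE_SHIFT == 9,
 * which matches the xe_normal_pt_shifts[] table above, this evaluates to:
 *
 *	xe_pt_shift(0) == 12	(4 KiB granularity)
 *	xe_pt_shift(1) == 21	(2 MiB)
 *	xe_pt_shift(2) == 30	(1 GiB)
 *	xe_pt_shift(3) == 39	(512 GiB)
 */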
176 
177 /**
178  * xe_pt_destroy() - Destroy a page-table tree.
179  * @pt: The root of the page-table tree to destroy.
180  * @flags: vm flags. Currently unused.
181  * @deferred: List head of lockless list for deferred putting. NULL for
182  *            immediate putting.
183  *
184  * Puts the page-table bo, recursively calls xe_pt_destroy on all children
185  * and finally frees @pt. TODO: Can we remove the @flags argument?
186  */
187 void xe_pt_destroy(struct xe_pt *pt, u32 flags, struct llist_head *deferred)
188 {
189 	int i;
190 
191 	if (!pt)
192 		return;
193 
194 	XE_WARN_ON(!list_empty(&pt->bo->ttm.base.gpuva.list));
195 	xe_bo_unpin(pt->bo);
196 	xe_bo_put_deferred(pt->bo, deferred);
197 
198 	if (pt->level > 0 && pt->num_live) {
199 		struct xe_pt_dir *pt_dir = as_xe_pt_dir(pt);
200 
201 		for (i = 0; i < XE_PDES; i++) {
202 			if (xe_pt_entry(pt_dir, i))
203 				xe_pt_destroy(xe_pt_entry(pt_dir, i), flags,
204 					      deferred);
205 		}
206 	}
207 	xe_pt_free(pt);
208 }
209 
210 /**
211  * DOC: Pagetable building
212  *
213  * Below we use the term "page-table" for both page-directories, containing
214  * pointers to lower level page-directories or page-tables, and level 0
215  * page-tables that contain only page-table-entries pointing to memory pages.
216  *
217  * When inserting an address range in an already existing page-table tree
218  * there will typically be a set of page-tables that are shared with other
219  * address ranges, and a set that are private to this address range.
220  * The set of shared page-tables can be at most two per level,
221  * and those can't be updated immediately because the entries of those
222  * page-tables may still be in use by the gpu for other mappings. Therefore
223  * when inserting entries into those, we instead stage those insertions by
224  * adding insertion data into struct xe_vm_pgtable_update structures. This
225  * data (subtrees for the cpu and page-table-entries for the gpu) is then
226  * added in a separate commit step. CPU-data is committed while still under the
227  * vm lock, the object lock and for userptr, the notifier lock in read mode.
228  * The GPU async data is committed either by the GPU or CPU after fulfilling
229  * relevant dependencies.
230  * For non-shared page-tables (and, in fact, for shared ones that aren't
231  * existing at the time of staging), we add the data in-place without the
232  * special update structures. This private part of the page-table tree will
233  * remain disconnected from the vm page-table tree until data is committed to
234  * the shared page tables of the vm tree in the commit phase.
235  */
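
/*
 * Illustrative sketch of the flow described above (see __xe_pt_bind_vma()
 * further down for the real implementation; locking, error handling and most
 * arguments are omitted here):
 *
 *	// Stage: build the disconnected subtree and the xe_vm_pgtable_update
 *	// entries describing writes into the shared page-tables.
 *	xe_pt_stage_bind(tile, vma, entries, &num_entries);
 *
 *	// Async commit: the staged PTEs are written into the shared
 *	// page-tables by the GPU (or CPU) once dependencies are fulfilled.
 *	fence = xe_migrate_update_pgtables(..., entries, num_entries, ...);
 *
 *	// CPU commit: splice the new subtree into the vm tree metadata, under
 *	// the vm lock, the object lock and (for userptr) the notifier lock in
 *	// read mode.
 *	xe_pt_commit_bind(vma, entries, num_entries, rebind, deferred);
 */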
236 
237 struct xe_pt_update {
238 	/** @update: The update structure we're building for this parent. */
239 	struct xe_vm_pgtable_update *update;
240 	/** @parent: The parent. Used to detect a parent change. */
241 	struct xe_pt *parent;
242 	/** @preexisting: Whether the parent was pre-existing or allocated */
243 	bool preexisting;
244 };
245 
246 struct xe_pt_stage_bind_walk {
247 	/** @base: The base class. */
248 	struct xe_pt_walk base;
249 
250 	/* Input parameters for the walk */
251 	/** @vm: The vm we're building for. */
252 	struct xe_vm *vm;
253 	/** @tile: The tile we're building for. */
254 	struct xe_tile *tile;
255 	/** @default_pte: PTE flag only template. No address is associated */
256 	u64 default_pte;
257 	/** @dma_offset: DMA offset to add to the PTE. */
258 	u64 dma_offset;
259 	/**
260 	 * @needs_64K: This address range enforces 64K alignment and
261 	 * granularity.
262 	 */
263 	bool needs_64K;
264 	/**
265 	 * @vma: VMA being mapped
266 	 */
267 	struct xe_vma *vma;
268 
269 	/* Also input, but is updated during the walk */
270 	/** @curs: The DMA address cursor. */
271 	struct xe_res_cursor *curs;
272 	/** @va_curs_start: The virtual address corresponding to @curs->start */
273 	u64 va_curs_start;
274 
275 	/* Output */
276 	struct xe_walk_update {
277 		/** @wupd.entries: Caller provided storage. */
278 		struct xe_vm_pgtable_update *entries;
279 		/** @wupd.num_used_entries: Number of update @entries used. */
280 		unsigned int num_used_entries;
281 		/** @wupd.updates: Tracks the update entry at a given level */
282 		struct xe_pt_update updates[XE_VM_MAX_LEVEL + 1];
283 	} wupd;
284 
285 	/* Walk state */
286 	/**
287 	 * @l0_end_addr: The end address of the current l0 leaf. Used for
288 	 * 64K granularity detection.
289 	 */
290 	u64 l0_end_addr;
291 	/** @addr_64K: The start address of the current 64K chunk. */
292 	u64 addr_64K;
293 	/** @found_64K: Whether @addr_64K actually points to a 64K chunk. */
294 	bool found_64K;
295 };
296 
297 static int
298 xe_pt_new_shared(struct xe_walk_update *wupd, struct xe_pt *parent,
299 		 pgoff_t offset, bool alloc_entries)
300 {
301 	struct xe_pt_update *upd = &wupd->updates[parent->level];
302 	struct xe_vm_pgtable_update *entry;
303 
304 	/*
305 	 * For *each level*, we can only have one active
306 	 * struct xe_pt_update at any one time. Once we move on to a
307 	 * new parent and page-directory, the old one is complete, and
308 	 * updates are either already stored in the build tree or in
309 	 * @wupd->entries
310 	 */
311 	if (likely(upd->parent == parent))
312 		return 0;
313 
314 	upd->parent = parent;
315 	upd->preexisting = true;
316 
317 	if (wupd->num_used_entries == XE_VM_MAX_LEVEL * 2 + 1)
318 		return -EINVAL;
319 
320 	entry = wupd->entries + wupd->num_used_entries++;
321 	upd->update = entry;
322 	entry->ofs = offset;
323 	entry->pt_bo = parent->bo;
324 	entry->pt = parent;
325 	entry->flags = 0;
326 	entry->qwords = 0;
327 
328 	if (alloc_entries) {
329 		entry->pt_entries = kmalloc_array(XE_PDES,
330 						  sizeof(*entry->pt_entries),
331 						  GFP_KERNEL);
332 		if (!entry->pt_entries)
333 			return -ENOMEM;
334 	}
335 
336 	return 0;
337 }
338 
339 /*
340  * NOTE: This is a very frequently called function so we allow ourselves
341  * to annotate (using branch prediction hints) the fastpath of updating a
342  * non-pre-existing pagetable with leaf ptes.
343  */
344 static int
345 xe_pt_insert_entry(struct xe_pt_stage_bind_walk *xe_walk, struct xe_pt *parent,
346 		   pgoff_t offset, struct xe_pt *xe_child, u64 pte)
347 {
348 	struct xe_pt_update *upd = &xe_walk->wupd.updates[parent->level];
349 	struct xe_pt_update *child_upd = xe_child ?
350 		&xe_walk->wupd.updates[xe_child->level] : NULL;
351 	int ret;
352 
353 	ret = xe_pt_new_shared(&xe_walk->wupd, parent, offset, true);
354 	if (unlikely(ret))
355 		return ret;
356 
357 	/*
358 	 * Register this new pagetable so that it won't be recognized as
359 	 * a shared pagetable by a subsequent insertion.
360 	 */
361 	if (unlikely(child_upd)) {
362 		child_upd->update = NULL;
363 		child_upd->parent = xe_child;
364 		child_upd->preexisting = false;
365 	}
366 
367 	if (likely(!upd->preexisting)) {
368 		/* Continue building a non-connected subtree. */
369 		struct iosys_map *map = &parent->bo->vmap;
370 
371 		if (unlikely(xe_child))
372 			parent->base.children[offset] = &xe_child->base;
373 
374 		xe_pt_write(xe_walk->vm->xe, map, offset, pte);
375 		parent->num_live++;
376 	} else {
377 		/* Shared pt. Stage update. */
378 		unsigned int idx;
379 		struct xe_vm_pgtable_update *entry = upd->update;
380 
381 		idx = offset - entry->ofs;
382 		entry->pt_entries[idx].pt = xe_child;
383 		entry->pt_entries[idx].pte = pte;
384 		entry->qwords++;
385 	}
386 
387 	return 0;
388 }
389 
390 static bool xe_pt_hugepte_possible(u64 addr, u64 next, unsigned int level,
391 				   struct xe_pt_stage_bind_walk *xe_walk)
392 {
393 	u64 size, dma;
394 
395 	if (level > MAX_HUGEPTE_LEVEL)
396 		return false;
397 
398 	/* Does the virtual range requested cover a huge pte? */
399 	if (!xe_pt_covers(addr, next, level, &xe_walk->base))
400 		return false;
401 
402 	/* Does the DMA segment cover the whole pte? */
403 	if (next - xe_walk->va_curs_start > xe_walk->curs->size)
404 		return false;
405 
406 	/* NULL VMAs do not have DMA addresses */
407 	if (xe_vma_is_null(xe_walk->vma))
408 		return true;
409 
410 	/* Is the DMA address huge PTE size aligned? */
411 	size = next - addr;
412 	dma = addr - xe_walk->va_curs_start + xe_res_dma(xe_walk->curs);
413 
414 	return IS_ALIGNED(dma, size);
415 }
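
/*
 * Example (illustrative, made-up numbers): for a level-1 (2 MiB) huge-PTE
 * candidate with addr == 0x1400000 and next == 0x1600000, the virtual range
 * covers the whole level-1 entry and the remaining DMA segment is large
 * enough, so the deciding test is whether
 * dma == addr - va_curs_start + xe_res_dma(curs) is 2 MiB aligned. If the
 * backing store only happens to be 64 KiB aligned, IS_ALIGNED(dma, size)
 * fails and the walk falls back to smaller PTEs for this range.
 */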
416 
417 /*
418  * Scan the requested mapping to check whether it can be done entirely
419  * with 64K PTEs.
420  */
421 static bool
422 xe_pt_scan_64K(u64 addr, u64 next, struct xe_pt_stage_bind_walk *xe_walk)
423 {
424 	struct xe_res_cursor curs = *xe_walk->curs;
425 
426 	if (!IS_ALIGNED(addr, SZ_64K))
427 		return false;
428 
429 	if (next > xe_walk->l0_end_addr)
430 		return false;
431 
432 	/* NULL VMAs do not have DMA addresses */
433 	if (xe_vma_is_null(xe_walk->vma))
434 		return true;
435 
436 	xe_res_next(&curs, addr - xe_walk->va_curs_start);
437 	for (; addr < next; addr += SZ_64K) {
438 		if (!IS_ALIGNED(xe_res_dma(&curs), SZ_64K) || curs.size < SZ_64K)
439 			return false;
440 
441 		xe_res_next(&curs, SZ_64K);
442 	}
443 
444 	return addr == next;
445 }
446 
447 /*
448  * For non-compact "normal" 4K level-0 pagetables, we want to try to group
449  * addresses together in 64K-contiguous regions to add a 64K TLB hint for the
450  * device to the PTE.
451  * This function determines whether the address is part of such a
452  * segment. For VRAM in normal pagetables, this is strictly necessary on
453  * some devices.
454  */
455 static bool
456 xe_pt_is_pte_ps64K(u64 addr, u64 next, struct xe_pt_stage_bind_walk *xe_walk)
457 {
458 	/* Address is within an already found 64k region */
459 	if (xe_walk->found_64K && addr - xe_walk->addr_64K < SZ_64K)
460 		return true;
461 
462 	xe_walk->found_64K = xe_pt_scan_64K(addr, addr + SZ_64K, xe_walk);
463 	xe_walk->addr_64K = addr;
464 
465 	return xe_walk->found_64K;
466 }
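
/*
 * Example (illustrative, made-up numbers): when stage-binding 4K PTEs at
 * addr == 0x200000 backed by a DMA segment that is 64K aligned and at least
 * 64K long, xe_pt_scan_64K() succeeds, @addr_64K is set to 0x200000 and the
 * following fifteen calls (for addresses up to but not including 0x210000)
 * hit the cached @found_64K, so all sixteen 4K PTEs of the chunk get the
 * XE_PTE_PS64 hint in xe_pt_stage_bind_entry().
 */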
467 
468 static int
469 xe_pt_stage_bind_entry(struct xe_ptw *parent, pgoff_t offset,
470 		       unsigned int level, u64 addr, u64 next,
471 		       struct xe_ptw **child,
472 		       enum page_walk_action *action,
473 		       struct xe_pt_walk *walk)
474 {
475 	struct xe_pt_stage_bind_walk *xe_walk =
476 		container_of(walk, typeof(*xe_walk), base);
477 	u16 pat_index = xe_walk->vma->pat_index;
478 	struct xe_pt *xe_parent = container_of(parent, typeof(*xe_parent), base);
479 	struct xe_vm *vm = xe_walk->vm;
480 	struct xe_pt *xe_child;
481 	bool covers;
482 	int ret = 0;
483 	u64 pte;
484 
485 	/* Is this a leaf entry? */
486 	if (level == 0 || xe_pt_hugepte_possible(addr, next, level, xe_walk)) {
487 		struct xe_res_cursor *curs = xe_walk->curs;
488 		bool is_null = xe_vma_is_null(xe_walk->vma);
489 
490 		XE_WARN_ON(xe_walk->va_curs_start != addr);
491 
492 		pte = vm->pt_ops->pte_encode_vma(is_null ? 0 :
493 						 xe_res_dma(curs) + xe_walk->dma_offset,
494 						 xe_walk->vma, pat_index, level);
495 		pte |= xe_walk->default_pte;
496 
497 		/*
498 		 * Set the XE_PTE_PS64 hint if possible, otherwise if
499 		 * this device *requires* 64K PTE size for VRAM, fail.
500 		 */
501 		if (level == 0 && !xe_parent->is_compact) {
502 			if (xe_pt_is_pte_ps64K(addr, next, xe_walk)) {
503 				xe_walk->vma->gpuva.flags |= XE_VMA_PTE_64K;
504 				pte |= XE_PTE_PS64;
505 			} else if (XE_WARN_ON(xe_walk->needs_64K)) {
506 				return -EINVAL;
507 			}
508 		}
509 
510 		ret = xe_pt_insert_entry(xe_walk, xe_parent, offset, NULL, pte);
511 		if (unlikely(ret))
512 			return ret;
513 
514 		if (!is_null)
515 			xe_res_next(curs, next - addr);
516 		xe_walk->va_curs_start = next;
517 		xe_walk->vma->gpuva.flags |= (XE_VMA_PTE_4K << level);
518 		*action = ACTION_CONTINUE;
519 
520 		return ret;
521 	}
522 
523 	/*
524 	 * Descending to lower level. Determine if we need to allocate a
525 	 * new page table or -directory, which we do if there is no
526 	 * previous one or there is one we can completely replace.
527 	 */
528 	if (level == 1) {
529 		walk->shifts = xe_normal_pt_shifts;
530 		xe_walk->l0_end_addr = next;
531 	}
532 
533 	covers = xe_pt_covers(addr, next, level, &xe_walk->base);
534 	if (covers || !*child) {
535 		u64 flags = 0;
536 
537 		xe_child = xe_pt_create(xe_walk->vm, xe_walk->tile, level - 1);
538 		if (IS_ERR(xe_child))
539 			return PTR_ERR(xe_child);
540 
541 		xe_pt_set_addr(xe_child,
542 			       round_down(addr, 1ull << walk->shifts[level]));
543 
544 		if (!covers)
545 			xe_pt_populate_empty(xe_walk->tile, xe_walk->vm, xe_child);
546 
547 		*child = &xe_child->base;
548 
549 		/*
550 		 * Prefer the compact pagetable layout for L0 if possible. Only
551 		 * possible if VMA covers entire 2MB region as compact 64k and
552 		 * 4k pages cannot be mixed within a 2MB region.
553 		 * TODO: Suballocate the pt bo to avoid wasting a lot of
554 		 * memory.
555 		 */
556 		if (GRAPHICS_VERx100(tile_to_xe(xe_walk->tile)) >= 1250 && level == 1 &&
557 		    covers && xe_pt_scan_64K(addr, next, xe_walk)) {
558 			walk->shifts = xe_compact_pt_shifts;
559 			xe_walk->vma->gpuva.flags |= XE_VMA_PTE_COMPACT;
560 			flags |= XE_PDE_64K;
561 			xe_child->is_compact = true;
562 		}
563 
564 		pte = vm->pt_ops->pde_encode_bo(xe_child->bo, 0, pat_index) | flags;
565 		ret = xe_pt_insert_entry(xe_walk, xe_parent, offset, xe_child,
566 					 pte);
567 	}
568 
569 	*action = ACTION_SUBTREE;
570 	return ret;
571 }
572 
573 static const struct xe_pt_walk_ops xe_pt_stage_bind_ops = {
574 	.pt_entry = xe_pt_stage_bind_entry,
575 };
576 
577 /**
578  * xe_pt_stage_bind() - Build a disconnected page-table tree for a given address
579  * range.
580  * @tile: The tile we're building for.
581  * @vma: The vma indicating the address range.
582  * @entries: Storage for the update entries used for connecting the tree to
583  * the main tree at commit time.
584  * @num_entries: On output contains the number of @entries used.
585  *
586  * This function builds a disconnected page-table tree for a given address
587  * range. The tree is connected to the main vm tree for the gpu using
588  * xe_migrate_update_pgtables() and for the cpu using xe_pt_commit_bind().
589  * The function builds xe_vm_pgtable_update structures for already existing
590  * shared page-tables, and non-existing shared and non-shared page-tables
591  * are built and populated directly.
592  *
593  * Return: 0 on success, negative error code on error.
594  */
595 static int
596 xe_pt_stage_bind(struct xe_tile *tile, struct xe_vma *vma,
597 		 struct xe_vm_pgtable_update *entries, u32 *num_entries)
598 {
599 	struct xe_device *xe = tile_to_xe(tile);
600 	struct xe_bo *bo = xe_vma_bo(vma);
601 	bool is_devmem = !xe_vma_is_userptr(vma) && bo &&
602 		(xe_bo_is_vram(bo) || xe_bo_is_stolen_devmem(bo));
603 	struct xe_res_cursor curs;
604 	struct xe_pt_stage_bind_walk xe_walk = {
605 		.base = {
606 			.ops = &xe_pt_stage_bind_ops,
607 			.shifts = xe_normal_pt_shifts,
608 			.max_level = XE_PT_HIGHEST_LEVEL,
609 		},
610 		.vm = xe_vma_vm(vma),
611 		.tile = tile,
612 		.curs = &curs,
613 		.va_curs_start = xe_vma_start(vma),
614 		.vma = vma,
615 		.wupd.entries = entries,
616 		.needs_64K = (xe_vma_vm(vma)->flags & XE_VM_FLAG_64K) && is_devmem,
617 	};
618 	struct xe_pt *pt = xe_vma_vm(vma)->pt_root[tile->id];
619 	int ret;
620 
621 	if (vma && (vma->gpuva.flags & XE_VMA_ATOMIC_PTE_BIT) &&
622 	    (is_devmem || !IS_DGFX(xe)))
623 		xe_walk.default_pte |= XE_USM_PPGTT_PTE_AE;
624 
625 	if (is_devmem) {
626 		xe_walk.default_pte |= XE_PPGTT_PTE_DM;
627 		xe_walk.dma_offset = vram_region_gpu_offset(bo->ttm.resource);
628 	}
629 
630 	if (!xe_vma_has_no_bo(vma) && xe_bo_is_stolen(bo))
631 		xe_walk.dma_offset = xe_ttm_stolen_gpu_offset(xe_bo_device(bo));
632 
633 	xe_bo_assert_held(bo);
634 
635 	if (!xe_vma_is_null(vma)) {
636 		if (xe_vma_is_userptr(vma))
637 			xe_res_first_sg(to_userptr_vma(vma)->userptr.sg, 0,
638 					xe_vma_size(vma), &curs);
639 		else if (xe_bo_is_vram(bo) || xe_bo_is_stolen(bo))
640 			xe_res_first(bo->ttm.resource, xe_vma_bo_offset(vma),
641 				     xe_vma_size(vma), &curs);
642 		else
643 			xe_res_first_sg(xe_bo_sg(bo), xe_vma_bo_offset(vma),
644 					xe_vma_size(vma), &curs);
645 	} else {
646 		curs.size = xe_vma_size(vma);
647 	}
648 
649 	ret = xe_pt_walk_range(&pt->base, pt->level, xe_vma_start(vma),
650 			       xe_vma_end(vma), &xe_walk.base);
651 
652 	*num_entries = xe_walk.wupd.num_used_entries;
653 	return ret;
654 }
655 
656 /**
657  * xe_pt_nonshared_offsets() - Determine the non-shared entry offsets of a
658  * shared pagetable.
659  * @addr: The start address within the non-shared pagetable.
660  * @end: The end address within the non-shared pagetable.
661  * @level: The level of the non-shared pagetable.
662  * @walk: Walk info. The function adjusts the walk action.
663  * @action: next action to perform (see enum page_walk_action)
664  * @offset: Ignored on input, first non-shared entry on output.
665  * @end_offset: Ignored on input, last non-shared entry + 1 on output.
666  *
667  * A non-shared page-table has some entries that belong to the address range
668  * and others that don't. This function determines the entries that belong
669  * fully to the address range. Depending on level, some entries may
670  * partially belong to the address range (that can't happen at level 0).
671  * The function detects that and adjusts those offsets to not include those
672  * partial entries. Iff it does detect partial entries, we know that there must
673  * be shared page tables also at lower levels, so it adjusts the walk action
674  * accordingly.
675  *
676  * Return: true if there were non-shared entries, false otherwise.
677  */
678 static bool xe_pt_nonshared_offsets(u64 addr, u64 end, unsigned int level,
679 				    struct xe_pt_walk *walk,
680 				    enum page_walk_action *action,
681 				    pgoff_t *offset, pgoff_t *end_offset)
682 {
683 	u64 size = 1ull << walk->shifts[level];
684 
685 	*offset = xe_pt_offset(addr, level, walk);
686 	*end_offset = xe_pt_num_entries(addr, end, level, walk) + *offset;
687 
688 	if (!level)
689 		return true;
690 
691 	/*
692 	 * If addr or next are not size aligned, there are shared pts at lower
693 	 * level, so in that case traverse down the subtree
694 	 */
695 	*action = ACTION_CONTINUE;
696 	if (!IS_ALIGNED(addr, size)) {
697 		*action = ACTION_SUBTREE;
698 		(*offset)++;
699 	}
700 
701 	if (!IS_ALIGNED(end, size)) {
702 		*action = ACTION_SUBTREE;
703 		(*end_offset)--;
704 	}
705 
706 	return *end_offset > *offset;
707 }
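
/*
 * Example (illustrative, made-up numbers): for a level-1 page-table (2 MiB
 * per entry) and the range [3 MiB, 9 MiB), 3 MiB is not 2 MiB aligned so the
 * first entry is partially shared and @offset is bumped past it, and 9 MiB
 * is not aligned either so @end_offset is pulled back. The remaining offsets
 * cover [4 MiB, 8 MiB) entirely, and *action is set to ACTION_SUBTREE so the
 * walk still descends into the partially covered entries.
 */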
708 
709 struct xe_pt_zap_ptes_walk {
710 	/** @base: The walk base-class */
711 	struct xe_pt_walk base;
712 
713 	/* Input parameters for the walk */
714 	/** @tile: The tile we're building for */
715 	struct xe_tile *tile;
716 
717 	/* Output */
718 	/** @needs_invalidate: Whether we need to invalidate TLB */
719 	bool needs_invalidate;
720 };
721 
722 static int xe_pt_zap_ptes_entry(struct xe_ptw *parent, pgoff_t offset,
723 				unsigned int level, u64 addr, u64 next,
724 				struct xe_ptw **child,
725 				enum page_walk_action *action,
726 				struct xe_pt_walk *walk)
727 {
728 	struct xe_pt_zap_ptes_walk *xe_walk =
729 		container_of(walk, typeof(*xe_walk), base);
730 	struct xe_pt *xe_child = container_of(*child, typeof(*xe_child), base);
731 	pgoff_t end_offset;
732 
733 	XE_WARN_ON(!*child);
734 	XE_WARN_ON(!level && xe_child->is_compact);
735 
736 	/*
737 	 * Note that we're called from an entry callback, and we're dealing
738 	 * with the child of that entry rather than the parent, so we need to
739 	 * adjust level down.
740 	 */
741 	if (xe_pt_nonshared_offsets(addr, next, --level, walk, action, &offset,
742 				    &end_offset)) {
743 		xe_map_memset(tile_to_xe(xe_walk->tile), &xe_child->bo->vmap,
744 			      offset * sizeof(u64), 0,
745 			      (end_offset - offset) * sizeof(u64));
746 		xe_walk->needs_invalidate = true;
747 	}
748 
749 	return 0;
750 }
751 
752 static const struct xe_pt_walk_ops xe_pt_zap_ptes_ops = {
753 	.pt_entry = xe_pt_zap_ptes_entry,
754 };
755 
756 /**
757  * xe_pt_zap_ptes() - Zap (zero) gpu ptes of an address range
758  * @tile: The tile we're zapping for.
759  * @vma: GPU VMA detailing address range.
760  *
761  * Eviction and userptr invalidation need to be able to zap the
762  * gpu ptes of a given address range in pagefaulting mode.
763  * In order to be able to do that, this function needs access to the shared
764  * page-table entries so it can either clear the leaf PTEs or
765  * clear the pointers to lower-level page-tables. The caller is required
766  * to hold the necessary locks to ensure neither the page-table connectivity
767  * nor the page-table entries of the range are updated from under us.
768  *
769  * Return: Whether ptes were actually updated and a TLB invalidation is
770  * required.
771  */
772 bool xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma *vma)
773 {
774 	struct xe_pt_zap_ptes_walk xe_walk = {
775 		.base = {
776 			.ops = &xe_pt_zap_ptes_ops,
777 			.shifts = xe_normal_pt_shifts,
778 			.max_level = XE_PT_HIGHEST_LEVEL,
779 		},
780 		.tile = tile,
781 	};
782 	struct xe_pt *pt = xe_vma_vm(vma)->pt_root[tile->id];
783 
784 	if (!(vma->tile_present & BIT(tile->id)))
785 		return false;
786 
787 	(void)xe_pt_walk_shared(&pt->base, pt->level, xe_vma_start(vma),
788 				xe_vma_end(vma), &xe_walk.base);
789 
790 	return xe_walk.needs_invalidate;
791 }
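
/*
 * Illustrative usage sketch (the real callers live in the eviction and
 * userptr-invalidation paths; the helper name below is hypothetical):
 *
 *	if (xe_pt_zap_ptes(tile, vma)) {
 *		// PTEs were actually zeroed, so stale translations may still
 *		// be cached and must be invalidated before the backing
 *		// memory is reused.
 *		issue_tlb_invalidation(tile, vma);	// hypothetical helper
 *	}
 */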
792 
793 static void
794 xe_vm_populate_pgtable(struct xe_migrate_pt_update *pt_update, struct xe_tile *tile,
795 		       struct iosys_map *map, void *data,
796 		       u32 qword_ofs, u32 num_qwords,
797 		       const struct xe_vm_pgtable_update *update)
798 {
799 	struct xe_pt_entry *ptes = update->pt_entries;
800 	u64 *ptr = data;
801 	u32 i;
802 
803 	for (i = 0; i < num_qwords; i++) {
804 		if (map)
805 			xe_map_wr(tile_to_xe(tile), map, (qword_ofs + i) *
806 				  sizeof(u64), u64, ptes[i].pte);
807 		else
808 			ptr[i] = ptes[i].pte;
809 	}
810 }
811 
812 static void xe_pt_abort_bind(struct xe_vma *vma,
813 			     struct xe_vm_pgtable_update *entries,
814 			     u32 num_entries)
815 {
816 	u32 i, j;
817 
818 	for (i = 0; i < num_entries; i++) {
819 		if (!entries[i].pt_entries)
820 			continue;
821 
822 		for (j = 0; j < entries[i].qwords; j++)
823 			xe_pt_destroy(entries[i].pt_entries[j].pt, xe_vma_vm(vma)->flags, NULL);
824 		kfree(entries[i].pt_entries);
825 	}
826 }
827 
828 static void xe_pt_commit_locks_assert(struct xe_vma *vma)
829 {
830 	struct xe_vm *vm = xe_vma_vm(vma);
831 
832 	lockdep_assert_held(&vm->lock);
833 
834 	if (xe_vma_is_userptr(vma))
835 		lockdep_assert_held_read(&vm->userptr.notifier_lock);
836 	else if (!xe_vma_is_null(vma))
837 		dma_resv_assert_held(xe_vma_bo(vma)->ttm.base.resv);
838 
839 	xe_vm_assert_held(vm);
840 }
841 
842 static void xe_pt_commit_bind(struct xe_vma *vma,
843 			      struct xe_vm_pgtable_update *entries,
844 			      u32 num_entries, bool rebind,
845 			      struct llist_head *deferred)
846 {
847 	u32 i, j;
848 
849 	xe_pt_commit_locks_assert(vma);
850 
851 	for (i = 0; i < num_entries; i++) {
852 		struct xe_pt *pt = entries[i].pt;
853 		struct xe_pt_dir *pt_dir;
854 
855 		if (!rebind)
856 			pt->num_live += entries[i].qwords;
857 
858 		if (!pt->level) {
859 			kfree(entries[i].pt_entries);
860 			continue;
861 		}
862 
863 		pt_dir = as_xe_pt_dir(pt);
864 		for (j = 0; j < entries[i].qwords; j++) {
865 			u32 j_ = j + entries[i].ofs;
866 			struct xe_pt *newpte = entries[i].pt_entries[j].pt;
867 
868 			if (xe_pt_entry(pt_dir, j_))
869 				xe_pt_destroy(xe_pt_entry(pt_dir, j_),
870 					      xe_vma_vm(vma)->flags, deferred);
871 
872 			pt_dir->children[j_] = &newpte->base;
873 		}
874 		kfree(entries[i].pt_entries);
875 	}
876 }
877 
878 static int
879 xe_pt_prepare_bind(struct xe_tile *tile, struct xe_vma *vma,
880 		   struct xe_vm_pgtable_update *entries, u32 *num_entries,
881 		   bool rebind)
882 {
883 	int err;
884 
885 	*num_entries = 0;
886 	err = xe_pt_stage_bind(tile, vma, entries, num_entries);
887 	if (!err)
888 		xe_tile_assert(tile, *num_entries);
889 	else /* abort! */
890 		xe_pt_abort_bind(vma, entries, *num_entries);
891 
892 	return err;
893 }
894 
895 static void xe_vm_dbg_print_entries(struct xe_device *xe,
896 				    const struct xe_vm_pgtable_update *entries,
897 				    unsigned int num_entries)
898 #if (IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM))
899 {
900 	unsigned int i;
901 
902 	vm_dbg(&xe->drm, "%u entries to update\n", num_entries);
903 	for (i = 0; i < num_entries; i++) {
904 		const struct xe_vm_pgtable_update *entry = &entries[i];
905 		struct xe_pt *xe_pt = entry->pt;
906 		u64 page_size = 1ull << xe_pt_shift(xe_pt->level);
907 		u64 end;
908 		u64 start;
909 
910 		xe_assert(xe, !entry->pt->is_compact);
911 		start = entry->ofs * page_size;
912 		end = start + page_size * entry->qwords;
913 		vm_dbg(&xe->drm,
914 		       "\t%u: Update level %u at (%u + %u) [%llx...%llx) f:%x\n",
915 		       i, xe_pt->level, entry->ofs, entry->qwords,
916 		       xe_pt_addr(xe_pt) + start, xe_pt_addr(xe_pt) + end, 0);
917 	}
918 }
919 #else
920 {}
921 #endif
922 
923 #ifdef CONFIG_DRM_XE_USERPTR_INVAL_INJECT
924 
925 static bool xe_pt_userptr_inject_eagain(struct xe_userptr_vma *uvma)
926 {
927 	u32 divisor = uvma->userptr.divisor ? uvma->userptr.divisor : 2;
928 	static u32 count;
929 
930 	if (count++ % divisor == divisor - 1) {
931 		struct xe_vm *vm = xe_vma_vm(&uvma->vma);
932 
933 		uvma->userptr.divisor = divisor << 1;
934 		spin_lock(&vm->userptr.invalidated_lock);
935 		list_move_tail(&uvma->userptr.invalidate_link,
936 			       &vm->userptr.invalidated);
937 		spin_unlock(&vm->userptr.invalidated_lock);
938 		return true;
939 	}
940 
941 	return false;
942 }
943 
944 #else
945 
946 static bool xe_pt_userptr_inject_eagain(struct xe_userptr_vma *uvma)
947 {
948 	return false;
949 }
950 
951 #endif
952 
953 /**
954  * struct xe_pt_migrate_pt_update - Callback argument for pre-commit callbacks
955  * @base: Base we derive from.
956  * @bind: Whether this is a bind or an unbind operation. A bind operation
957  *        makes the pre-commit callback error with -EAGAIN if it detects a
958  *        pending invalidation.
959  * @locked: Whether the pre-commit callback locked the userptr notifier lock
960  *          and it needs unlocking.
961  */
962 struct xe_pt_migrate_pt_update {
963 	struct xe_migrate_pt_update base;
964 	bool bind;
965 	bool locked;
966 };
967 
968 /*
969  * This function adds the needed dependencies to a page-table update job
970  * to make sure racing jobs for separate bind engines don't race writing
971  * to the same page-table range, wreaking havoc. Initially use a single
972  * fence for the entire VM. An optimization would use smaller granularity.
973  */
974 static int xe_pt_vm_dependencies(struct xe_sched_job *job,
975 				 struct xe_range_fence_tree *rftree,
976 				 u64 start, u64 last)
977 {
978 	struct xe_range_fence *rtfence;
979 	struct dma_fence *fence;
980 	int err;
981 
982 	rtfence = xe_range_fence_tree_first(rftree, start, last);
983 	while (rtfence) {
984 		fence = rtfence->fence;
985 
986 		if (!dma_fence_is_signaled(fence)) {
987 			/*
988 			 * Is this a CPU update? GPU is busy updating, so return
989 			 * an error
990 			 */
991 			if (!job)
992 				return -ETIME;
993 
994 			dma_fence_get(fence);
995 			err = drm_sched_job_add_dependency(&job->drm, fence);
996 			if (err)
997 				return err;
998 		}
999 
1000 		rtfence = xe_range_fence_tree_next(rtfence, start, last);
1001 	}
1002 
1003 	return 0;
1004 }
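
/*
 * In effect: for every still-unsignaled range fence overlapping
 * [start, last], a GPU job gets
 *
 *	drm_sched_job_add_dependency(&job->drm, fence);
 *
 * added, while a CPU update (job == NULL) returns -ETIME because the GPU is
 * still busy updating that range.
 */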
1005 
1006 static int xe_pt_pre_commit(struct xe_migrate_pt_update *pt_update)
1007 {
1008 	struct xe_range_fence_tree *rftree =
1009 		&xe_vma_vm(pt_update->vma)->rftree[pt_update->tile_id];
1010 
1011 	return xe_pt_vm_dependencies(pt_update->job, rftree,
1012 				     pt_update->start, pt_update->last);
1013 }
1014 
1015 static int xe_pt_userptr_pre_commit(struct xe_migrate_pt_update *pt_update)
1016 {
1017 	struct xe_pt_migrate_pt_update *userptr_update =
1018 		container_of(pt_update, typeof(*userptr_update), base);
1019 	struct xe_userptr_vma *uvma = to_userptr_vma(pt_update->vma);
1020 	unsigned long notifier_seq = uvma->userptr.notifier_seq;
1021 	struct xe_vm *vm = xe_vma_vm(&uvma->vma);
1022 	int err = xe_pt_vm_dependencies(pt_update->job,
1023 					&vm->rftree[pt_update->tile_id],
1024 					pt_update->start,
1025 					pt_update->last);
1026 
1027 	if (err)
1028 		return err;
1029 
1030 	userptr_update->locked = false;
1031 
1032 	/*
1033 	 * Wait until nobody is running the invalidation notifier, and
1034 	 * since we're exiting the loop holding the notifier lock,
1035 	 * nobody can proceed invalidating either.
1036 	 *
1037 	 * Note that we don't update the vma->userptr.notifier_seq since
1038 	 * we don't update the userptr pages.
1039 	 */
1040 	do {
1041 		down_read(&vm->userptr.notifier_lock);
1042 		if (!mmu_interval_read_retry(&uvma->userptr.notifier,
1043 					     notifier_seq))
1044 			break;
1045 
1046 		up_read(&vm->userptr.notifier_lock);
1047 
1048 		if (userptr_update->bind)
1049 			return -EAGAIN;
1050 
1051 		notifier_seq = mmu_interval_read_begin(&uvma->userptr.notifier);
1052 	} while (true);
1053 
1054 	/* Inject errors to test whether they are handled correctly */
1055 	if (userptr_update->bind && xe_pt_userptr_inject_eagain(uvma)) {
1056 		up_read(&vm->userptr.notifier_lock);
1057 		return -EAGAIN;
1058 	}
1059 
1060 	userptr_update->locked = true;
1061 
1062 	return 0;
1063 }
1064 
1065 static const struct xe_migrate_pt_update_ops bind_ops = {
1066 	.populate = xe_vm_populate_pgtable,
1067 	.pre_commit = xe_pt_pre_commit,
1068 };
1069 
1070 static const struct xe_migrate_pt_update_ops userptr_bind_ops = {
1071 	.populate = xe_vm_populate_pgtable,
1072 	.pre_commit = xe_pt_userptr_pre_commit,
1073 };
1074 
1075 struct invalidation_fence {
1076 	struct xe_gt_tlb_invalidation_fence base;
1077 	struct xe_gt *gt;
1078 	struct xe_vma *vma;
1079 	struct dma_fence *fence;
1080 	struct dma_fence_cb cb;
1081 	struct work_struct work;
1082 };
1083 
1084 static const char *
1085 invalidation_fence_get_driver_name(struct dma_fence *dma_fence)
1086 {
1087 	return "xe";
1088 }
1089 
1090 static const char *
1091 invalidation_fence_get_timeline_name(struct dma_fence *dma_fence)
1092 {
1093 	return "invalidation_fence";
1094 }
1095 
1096 static const struct dma_fence_ops invalidation_fence_ops = {
1097 	.get_driver_name = invalidation_fence_get_driver_name,
1098 	.get_timeline_name = invalidation_fence_get_timeline_name,
1099 };
1100 
1101 static void invalidation_fence_cb(struct dma_fence *fence,
1102 				  struct dma_fence_cb *cb)
1103 {
1104 	struct invalidation_fence *ifence =
1105 		container_of(cb, struct invalidation_fence, cb);
1106 
1107 	trace_xe_gt_tlb_invalidation_fence_cb(&ifence->base);
1108 	if (!ifence->fence->error) {
1109 		queue_work(system_wq, &ifence->work);
1110 	} else {
1111 		ifence->base.base.error = ifence->fence->error;
1112 		dma_fence_signal(&ifence->base.base);
1113 		dma_fence_put(&ifence->base.base);
1114 	}
1115 	dma_fence_put(ifence->fence);
1116 }
1117 
1118 static void invalidation_fence_work_func(struct work_struct *w)
1119 {
1120 	struct invalidation_fence *ifence =
1121 		container_of(w, struct invalidation_fence, work);
1122 
1123 	trace_xe_gt_tlb_invalidation_fence_work_func(&ifence->base);
1124 	xe_gt_tlb_invalidation_vma(ifence->gt, &ifence->base, ifence->vma);
1125 }
1126 
1127 static int invalidation_fence_init(struct xe_gt *gt,
1128 				   struct invalidation_fence *ifence,
1129 				   struct dma_fence *fence,
1130 				   struct xe_vma *vma)
1131 {
1132 	int ret;
1133 
1134 	trace_xe_gt_tlb_invalidation_fence_create(&ifence->base);
1135 
1136 	spin_lock_irq(&gt->tlb_invalidation.lock);
1137 	dma_fence_init(&ifence->base.base, &invalidation_fence_ops,
1138 		       &gt->tlb_invalidation.lock,
1139 		       gt->tlb_invalidation.fence_context,
1140 		       ++gt->tlb_invalidation.fence_seqno);
1141 	spin_unlock_irq(&gt->tlb_invalidation.lock);
1142 
1143 	INIT_LIST_HEAD(&ifence->base.link);
1144 
1145 	dma_fence_get(&ifence->base.base);	/* Ref for caller */
1146 	ifence->fence = fence;
1147 	ifence->gt = gt;
1148 	ifence->vma = vma;
1149 
1150 	INIT_WORK(&ifence->work, invalidation_fence_work_func);
1151 	ret = dma_fence_add_callback(fence, &ifence->cb, invalidation_fence_cb);
1152 	if (ret == -ENOENT) {
1153 		dma_fence_put(ifence->fence);	/* Usually dropped in CB */
1154 		invalidation_fence_work_func(&ifence->work);
1155 	} else if (ret) {
1156 		dma_fence_put(&ifence->base.base);	/* Caller ref */
1157 		dma_fence_put(&ifence->base.base);	/* Creation ref */
1158 	}
1159 
1160 	xe_gt_assert(gt, !ret || ret == -ENOENT);
1161 
1162 	return ret && ret != -ENOENT ? ret : 0;
1163 }
1164 
1165 static void xe_pt_calc_rfence_interval(struct xe_vma *vma,
1166 				       struct xe_pt_migrate_pt_update *update,
1167 				       struct xe_vm_pgtable_update *entries,
1168 				       u32 num_entries)
1169 {
1170 	int i, level = 0;
1171 
1172 	for (i = 0; i < num_entries; i++) {
1173 		const struct xe_vm_pgtable_update *entry = &entries[i];
1174 
1175 		if (entry->pt->level > level)
1176 			level = entry->pt->level;
1177 	}
1178 
1179 	/* Greedy (non-optimal) calculation but simple */
1180 	update->base.start = ALIGN_DOWN(xe_vma_start(vma),
1181 					0x1ull << xe_pt_shift(level));
1182 	update->base.last = ALIGN(xe_vma_end(vma),
1183 				  0x1ull << xe_pt_shift(level)) - 1;
1184 }
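
/*
 * Example (illustrative, made-up numbers): if the highest-level page-table
 * touched by the update is level 1 (xe_pt_shift(1) == 21, i.e. 2 MiB
 * granularity) and the vma spans [3 MiB, 5 MiB), the range fence interval
 * becomes [2 MiB, 6 MiB - 1]. Greedy, as noted below, but it covers every
 * page-table range the update may write.
 */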
1185 
1186 /**
1187  * __xe_pt_bind_vma() - Build and connect a page-table tree for the vma
1188  * address range.
1189  * @tile: The tile to bind for.
1190  * @vma: The vma to bind.
1191  * @q: The exec_queue with which to do pipelined page-table updates.
1192  * @syncs: Entries to sync on before binding the built tree to the live vm tree.
1193  * @num_syncs: Number of @sync entries.
1194  * @rebind: Whether we're rebinding this vma to the same address range without
1195  * an unbind in-between.
1196  *
1197  * This function builds a page-table tree (see xe_pt_stage_bind() for more
1198  * information on page-table building), and the xe_vm_pgtable_update entries
1199  * abstracting the operations needed to attach it to the main vm tree. It
1200  * then takes the relevant locks and updates the metadata side of the main
1201  * vm tree and submits the operations for pipelined attachment of the
1202  * gpu page-table to the vm main tree (which can be done either by the
1203  * CPU or the GPU).
1204  *
1205  * Return: A valid dma-fence representing the pipelined attachment operation
1206  * on success, an error pointer on error.
1207  */
1208 struct dma_fence *
1209 __xe_pt_bind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queue *q,
1210 		 struct xe_sync_entry *syncs, u32 num_syncs,
1211 		 bool rebind)
1212 {
1213 	struct xe_vm_pgtable_update entries[XE_VM_MAX_LEVEL * 2 + 1];
1214 	struct xe_pt_migrate_pt_update bind_pt_update = {
1215 		.base = {
1216 			.ops = xe_vma_is_userptr(vma) ? &userptr_bind_ops : &bind_ops,
1217 			.vma = vma,
1218 			.tile_id = tile->id,
1219 		},
1220 		.bind = true,
1221 	};
1222 	struct xe_vm *vm = xe_vma_vm(vma);
1223 	u32 num_entries;
1224 	struct dma_fence *fence;
1225 	struct invalidation_fence *ifence = NULL;
1226 	struct xe_range_fence *rfence;
1227 	int err;
1228 
1229 	bind_pt_update.locked = false;
1230 	xe_bo_assert_held(xe_vma_bo(vma));
1231 	xe_vm_assert_held(vm);
1232 
1233 	vm_dbg(&xe_vma_vm(vma)->xe->drm,
1234 	       "Preparing bind, with range [%llx...%llx) engine %p.\n",
1235 	       xe_vma_start(vma), xe_vma_end(vma), q);
1236 
1237 	err = xe_pt_prepare_bind(tile, vma, entries, &num_entries, rebind);
1238 	if (err)
1239 		goto err;
1240 	xe_tile_assert(tile, num_entries <= ARRAY_SIZE(entries));
1241 
1242 	xe_vm_dbg_print_entries(tile_to_xe(tile), entries, num_entries);
1243 	xe_pt_calc_rfence_interval(vma, &bind_pt_update, entries,
1244 				   num_entries);
1245 
1246 	/*
1247 	 * If rebind, we have to invalidate the TLB on !LR VMs, because otherwise
1248 	 * cached PTEs may point to freed memory. On LR VMs this is done
1249 	 * automatically when the context is re-enabled by the rebind worker,
1250 	 * or in fault mode it was invalidated on PTE zapping.
1251 	 *
1252 	 * If !rebind, and on scratch-enabled VMs, there is a chance the scratch
1253 	 * PTE is already cached in the TLB so it needs to be invalidated.
1254 	 * On !LR VMs this is done in the ring ops preceding a batch, but on
1255 	 * non-faulting LR, in particular on user-space batch buffer chaining,
1256 	 * it needs to be done here.
1257 	 */
1258 	if ((rebind && !xe_vm_in_lr_mode(vm) && !vm->batch_invalidate_tlb) ||
1259 	    (!rebind && xe_vm_has_scratch(vm) && xe_vm_in_preempt_fence_mode(vm))) {
1260 		ifence = kzalloc(sizeof(*ifence), GFP_KERNEL);
1261 		if (!ifence)
1262 			return ERR_PTR(-ENOMEM);
1263 	}
1264 
1265 	rfence = kzalloc(sizeof(*rfence), GFP_KERNEL);
1266 	if (!rfence) {
1267 		kfree(ifence);
1268 		return ERR_PTR(-ENOMEM);
1269 	}
1270 
1271 	fence = xe_migrate_update_pgtables(tile->migrate,
1272 					   vm, xe_vma_bo(vma), q,
1273 					   entries, num_entries,
1274 					   syncs, num_syncs,
1275 					   &bind_pt_update.base);
1276 	if (!IS_ERR(fence)) {
1277 		bool last_munmap_rebind = vma->gpuva.flags & XE_VMA_LAST_REBIND;
1278 		LLIST_HEAD(deferred);
1279 		int err;
1280 
1281 		err = xe_range_fence_insert(&vm->rftree[tile->id], rfence,
1282 					    &xe_range_fence_kfree_ops,
1283 					    bind_pt_update.base.start,
1284 					    bind_pt_update.base.last, fence);
1285 		if (err)
1286 			dma_fence_wait(fence, false);
1287 
1288 		/* TLB invalidation must be done before signaling rebind */
1289 		if (ifence) {
1290 			int err = invalidation_fence_init(tile->primary_gt, ifence, fence,
1291 							  vma);
1292 			if (err) {
1293 				dma_fence_put(fence);
1294 				kfree(ifence);
1295 				return ERR_PTR(err);
1296 			}
1297 			fence = &ifence->base.base;
1298 		}
1299 
1300 		/* add shared fence now for pagetable delayed destroy */
1301 		dma_resv_add_fence(xe_vm_resv(vm), fence, !rebind &&
1302 				   last_munmap_rebind ?
1303 				   DMA_RESV_USAGE_KERNEL :
1304 				   DMA_RESV_USAGE_BOOKKEEP);
1305 
1306 		if (!xe_vma_has_no_bo(vma) && !xe_vma_bo(vma)->vm)
1307 			dma_resv_add_fence(xe_vma_bo(vma)->ttm.base.resv, fence,
1308 					   DMA_RESV_USAGE_BOOKKEEP);
1309 		xe_pt_commit_bind(vma, entries, num_entries, rebind,
1310 				  bind_pt_update.locked ? &deferred : NULL);
1311 
1312 		/* This vma is live (again?) now */
1313 		vma->tile_present |= BIT(tile->id);
1314 
1315 		if (bind_pt_update.locked) {
1316 			to_userptr_vma(vma)->userptr.initial_bind = true;
1317 			up_read(&vm->userptr.notifier_lock);
1318 			xe_bo_put_commit(&deferred);
1319 		}
1320 		if (!rebind && last_munmap_rebind &&
1321 		    xe_vm_in_preempt_fence_mode(vm))
1322 			xe_vm_queue_rebind_worker(vm);
1323 	} else {
1324 		kfree(rfence);
1325 		kfree(ifence);
1326 		if (bind_pt_update.locked)
1327 			up_read(&vm->userptr.notifier_lock);
1328 		xe_pt_abort_bind(vma, entries, num_entries);
1329 	}
1330 
1331 	return fence;
1332 
1333 err:
1334 	return ERR_PTR(err);
1335 }
1336 
1337 struct xe_pt_stage_unbind_walk {
1338 	/** @base: The pagewalk base-class. */
1339 	struct xe_pt_walk base;
1340 
1341 	/* Input parameters for the walk */
1342 	/** @tile: The tile we're unbinding from. */
1343 	struct xe_tile *tile;
1344 
1345 	/**
1346 	 * @modified_start: Walk range start, modified to include any
1347 	 * shared pagetables that we're the only user of and can thus
1348 	 * treat as private.
1349 	 */
1350 	u64 modified_start;
1351 	/** @modified_end: Walk range end, modified like @modified_start. */
1352 	u64 modified_end;
1353 
1354 	/* Output */
1355 	/* @wupd: Structure to track the page-table updates we're building */
1356 	struct xe_walk_update wupd;
1357 };
1358 
1359 /*
1360  * Check whether this range is the only one populating this pagetable,
1361  * and in that case, update the walk range checks so that higher levels don't
1362  * view us as a shared pagetable.
1363  */
1364 static bool xe_pt_check_kill(u64 addr, u64 next, unsigned int level,
1365 			     const struct xe_pt *child,
1366 			     enum page_walk_action *action,
1367 			     struct xe_pt_walk *walk)
1368 {
1369 	struct xe_pt_stage_unbind_walk *xe_walk =
1370 		container_of(walk, typeof(*xe_walk), base);
1371 	unsigned int shift = walk->shifts[level];
1372 	u64 size = 1ull << shift;
1373 
1374 	if (IS_ALIGNED(addr, size) && IS_ALIGNED(next, size) &&
1375 	    ((next - addr) >> shift) == child->num_live) {
1376 		u64 size = 1ull << walk->shifts[level + 1];
1377 
1378 		*action = ACTION_CONTINUE;
1379 
1380 		if (xe_walk->modified_start >= addr)
1381 			xe_walk->modified_start = round_down(addr, size);
1382 		if (xe_walk->modified_end <= next)
1383 			xe_walk->modified_end = round_up(next, size);
1384 
1385 		return true;
1386 	}
1387 
1388 	return false;
1389 }
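
/*
 * Example (illustrative, made-up numbers): unbinding [2 MiB, 4 MiB) where a
 * fully populated level-0 page-table maps exactly that range: the range is
 * size aligned and (next - addr) >> shift == 512 == child->num_live, so the
 * whole page-table can be killed, and @modified_start/@modified_end are
 * rounded out to the parent entry's boundaries, meaning the parent is no
 * longer treated as a shared page-table by this walk.
 */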
1390 
1391 static int xe_pt_stage_unbind_entry(struct xe_ptw *parent, pgoff_t offset,
1392 				    unsigned int level, u64 addr, u64 next,
1393 				    struct xe_ptw **child,
1394 				    enum page_walk_action *action,
1395 				    struct xe_pt_walk *walk)
1396 {
1397 	struct xe_pt *xe_child = container_of(*child, typeof(*xe_child), base);
1398 
1399 	XE_WARN_ON(!*child);
1400 	XE_WARN_ON(!level && xe_child->is_compact);
1401 
1402 	xe_pt_check_kill(addr, next, level - 1, xe_child, action, walk);
1403 
1404 	return 0;
1405 }
1406 
1407 static int
1408 xe_pt_stage_unbind_post_descend(struct xe_ptw *parent, pgoff_t offset,
1409 				unsigned int level, u64 addr, u64 next,
1410 				struct xe_ptw **child,
1411 				enum page_walk_action *action,
1412 				struct xe_pt_walk *walk)
1413 {
1414 	struct xe_pt_stage_unbind_walk *xe_walk =
1415 		container_of(walk, typeof(*xe_walk), base);
1416 	struct xe_pt *xe_child = container_of(*child, typeof(*xe_child), base);
1417 	pgoff_t end_offset;
1418 	u64 size = 1ull << walk->shifts[--level];
1419 
1420 	if (!IS_ALIGNED(addr, size))
1421 		addr = xe_walk->modified_start;
1422 	if (!IS_ALIGNED(next, size))
1423 		next = xe_walk->modified_end;
1424 
1425 	/* Parent == *child is the root pt. Don't kill it. */
1426 	if (parent != *child &&
1427 	    xe_pt_check_kill(addr, next, level, xe_child, action, walk))
1428 		return 0;
1429 
1430 	if (!xe_pt_nonshared_offsets(addr, next, level, walk, action, &offset,
1431 				     &end_offset))
1432 		return 0;
1433 
1434 	(void)xe_pt_new_shared(&xe_walk->wupd, xe_child, offset, false);
1435 	xe_walk->wupd.updates[level].update->qwords = end_offset - offset;
1436 
1437 	return 0;
1438 }
1439 
1440 static const struct xe_pt_walk_ops xe_pt_stage_unbind_ops = {
1441 	.pt_entry = xe_pt_stage_unbind_entry,
1442 	.pt_post_descend = xe_pt_stage_unbind_post_descend,
1443 };
1444 
1445 /**
1446  * xe_pt_stage_unbind() - Build page-table update structures for an unbind
1447  * operation
1448  * @tile: The tile we're unbinding for.
1449  * @vma: The vma we're unbinding.
1450  * @entries: Caller-provided storage for the update structures.
1451  *
1452  * Builds page-table update structures for an unbind operation. The function
1453  * will attempt to remove all page-tables that we're the only user
1454  * of, and for that to work, the unbind operation must be committed in the
1455  * same critical section that blocks racing binds to the same page-table tree.
1456  *
1457  * Return: The number of entries used.
1458  */
1459 static unsigned int xe_pt_stage_unbind(struct xe_tile *tile, struct xe_vma *vma,
1460 				       struct xe_vm_pgtable_update *entries)
1461 {
1462 	struct xe_pt_stage_unbind_walk xe_walk = {
1463 		.base = {
1464 			.ops = &xe_pt_stage_unbind_ops,
1465 			.shifts = xe_normal_pt_shifts,
1466 			.max_level = XE_PT_HIGHEST_LEVEL,
1467 		},
1468 		.tile = tile,
1469 		.modified_start = xe_vma_start(vma),
1470 		.modified_end = xe_vma_end(vma),
1471 		.wupd.entries = entries,
1472 	};
1473 	struct xe_pt *pt = xe_vma_vm(vma)->pt_root[tile->id];
1474 
1475 	(void)xe_pt_walk_shared(&pt->base, pt->level, xe_vma_start(vma),
1476 				xe_vma_end(vma), &xe_walk.base);
1477 
1478 	return xe_walk.wupd.num_used_entries;
1479 }
1480 
1481 static void
1482 xe_migrate_clear_pgtable_callback(struct xe_migrate_pt_update *pt_update,
1483 				  struct xe_tile *tile, struct iosys_map *map,
1484 				  void *ptr, u32 qword_ofs, u32 num_qwords,
1485 				  const struct xe_vm_pgtable_update *update)
1486 {
1487 	struct xe_vma *vma = pt_update->vma;
1488 	u64 empty = __xe_pt_empty_pte(tile, xe_vma_vm(vma), update->pt->level);
1489 	int i;
1490 
1491 	if (map && map->is_iomem)
1492 		for (i = 0; i < num_qwords; ++i)
1493 			xe_map_wr(tile_to_xe(tile), map, (qword_ofs + i) *
1494 				  sizeof(u64), u64, empty);
1495 	else if (map)
1496 		memset64(map->vaddr + qword_ofs * sizeof(u64), empty,
1497 			 num_qwords);
1498 	else
1499 		memset64(ptr, empty, num_qwords);
1500 }
1501 
1502 static void
1503 xe_pt_commit_unbind(struct xe_vma *vma,
1504 		    struct xe_vm_pgtable_update *entries, u32 num_entries,
1505 		    struct llist_head *deferred)
1506 {
1507 	u32 j;
1508 
1509 	xe_pt_commit_locks_assert(vma);
1510 
1511 	for (j = 0; j < num_entries; ++j) {
1512 		struct xe_vm_pgtable_update *entry = &entries[j];
1513 		struct xe_pt *pt = entry->pt;
1514 
1515 		pt->num_live -= entry->qwords;
1516 		if (pt->level) {
1517 			struct xe_pt_dir *pt_dir = as_xe_pt_dir(pt);
1518 			u32 i;
1519 
1520 			for (i = entry->ofs; i < entry->ofs + entry->qwords;
1521 			     i++) {
1522 				if (xe_pt_entry(pt_dir, i))
1523 					xe_pt_destroy(xe_pt_entry(pt_dir, i),
1524 						      xe_vma_vm(vma)->flags, deferred);
1525 
1526 				pt_dir->children[i] = NULL;
1527 			}
1528 		}
1529 	}
1530 }
1531 
1532 static const struct xe_migrate_pt_update_ops unbind_ops = {
1533 	.populate = xe_migrate_clear_pgtable_callback,
1534 	.pre_commit = xe_pt_pre_commit,
1535 };
1536 
1537 static const struct xe_migrate_pt_update_ops userptr_unbind_ops = {
1538 	.populate = xe_migrate_clear_pgtable_callback,
1539 	.pre_commit = xe_pt_userptr_pre_commit,
1540 };
1541 
1542 /**
1543  * __xe_pt_unbind_vma() - Disconnect and free a page-table tree for the vma
1544  * address range.
1545  * @tile: The tile to unbind for.
1546  * @vma: The vma to unbind.
1547  * @q: The exec_queue with which to do pipelined page-table updates.
1548  * @syncs: Entries to sync on before disconnecting the tree to be destroyed.
1549  * @num_syncs: Number of @sync entries.
1550  *
1551  * This function builds the xe_vm_pgtable_update entries abstracting the
1552  * operations needed to detach the page-table tree to be destroyed from the
1553  * main vm tree.
1554  * It then takes the relevant locks and submits the operations for
1555  * pipelined detachment of the gpu page-table from the vm main tree
1556  * (which can be done either by the CPU or the GPU). Finally, it frees the
1557  * detached page-table tree.
1558  *
1559  * Return: A valid dma-fence representing the pipelined detachment operation
1560  * on success, an error pointer on error.
1561  */
1562 struct dma_fence *
1563 __xe_pt_unbind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queue *q,
1564 		   struct xe_sync_entry *syncs, u32 num_syncs)
1565 {
1566 	struct xe_vm_pgtable_update entries[XE_VM_MAX_LEVEL * 2 + 1];
1567 	struct xe_pt_migrate_pt_update unbind_pt_update = {
1568 		.base = {
1569 			.ops = xe_vma_is_userptr(vma) ? &userptr_unbind_ops :
1570 			&unbind_ops,
1571 			.vma = vma,
1572 			.tile_id = tile->id,
1573 		},
1574 	};
1575 	struct xe_vm *vm = xe_vma_vm(vma);
1576 	u32 num_entries;
1577 	struct dma_fence *fence = NULL;
1578 	struct invalidation_fence *ifence;
1579 	struct xe_range_fence *rfence;
1580 
1581 	LLIST_HEAD(deferred);
1582 
1583 	xe_bo_assert_held(xe_vma_bo(vma));
1584 	xe_vm_assert_held(vm);
1585 
1586 	vm_dbg(&xe_vma_vm(vma)->xe->drm,
1587 	       "Preparing unbind, with range [%llx...%llx) engine %p.\n",
1588 	       xe_vma_start(vma), xe_vma_end(vma), q);
1589 
1590 	num_entries = xe_pt_stage_unbind(tile, vma, entries);
1591 	xe_tile_assert(tile, num_entries <= ARRAY_SIZE(entries));
1592 
1593 	xe_vm_dbg_print_entries(tile_to_xe(tile), entries, num_entries);
1594 	xe_pt_calc_rfence_interval(vma, &unbind_pt_update, entries,
1595 				   num_entries);
1596 
1597 	ifence = kzalloc(sizeof(*ifence), GFP_KERNEL);
1598 	if (!ifence)
1599 		return ERR_PTR(-ENOMEM);
1600 
1601 	rfence = kzalloc(sizeof(*rfence), GFP_KERNEL);
1602 	if (!rfence) {
1603 		kfree(ifence);
1604 		return ERR_PTR(-ENOMEM);
1605 	}
1606 
1607 	/*
1608 	 * Even if we were already evicted and unbind to destroy, we need to
1609 	 * clear again here. The eviction may have updated pagetables at a
1610 	 * lower level, because it needs to be more conservative.
1611 	 */
1612 	fence = xe_migrate_update_pgtables(tile->migrate,
1613 					   vm, NULL, q ? q :
1614 					   vm->q[tile->id],
1615 					   entries, num_entries,
1616 					   syncs, num_syncs,
1617 					   &unbind_pt_update.base);
1618 	if (!IS_ERR(fence)) {
1619 		int err;
1620 
1621 		err = xe_range_fence_insert(&vm->rftree[tile->id], rfence,
1622 					    &xe_range_fence_kfree_ops,
1623 					    unbind_pt_update.base.start,
1624 					    unbind_pt_update.base.last, fence);
1625 		if (err)
1626 			dma_fence_wait(fence, false);
1627 
1628 		/* TLB invalidation must be done before signaling unbind */
1629 		err = invalidation_fence_init(tile->primary_gt, ifence, fence, vma);
1630 		if (err) {
1631 			dma_fence_put(fence);
1632 			kfree(ifence);
1633 			return ERR_PTR(err);
1634 		}
1635 		fence = &ifence->base.base;
1636 
1637 		/* add shared fence now for pagetable delayed destroy */
1638 		dma_resv_add_fence(xe_vm_resv(vm), fence,
1639 				   DMA_RESV_USAGE_BOOKKEEP);
1640 
1641 		/* This fence will be installed by caller when doing eviction */
1642 		if (!xe_vma_has_no_bo(vma) && !xe_vma_bo(vma)->vm)
1643 			dma_resv_add_fence(xe_vma_bo(vma)->ttm.base.resv, fence,
1644 					   DMA_RESV_USAGE_BOOKKEEP);
1645 		xe_pt_commit_unbind(vma, entries, num_entries,
1646 				    unbind_pt_update.locked ? &deferred : NULL);
1647 		vma->tile_present &= ~BIT(tile->id);
1648 	} else {
1649 		kfree(rfence);
1650 		kfree(ifence);
1651 	}
1652 
1653 	if (!vma->tile_present)
1654 		list_del_init(&vma->combined_links.rebind);
1655 
1656 	if (unbind_pt_update.locked) {
1657 		xe_tile_assert(tile, xe_vma_is_userptr(vma));
1658 
1659 		if (!vma->tile_present) {
1660 			spin_lock(&vm->userptr.invalidated_lock);
1661 			list_del_init(&to_userptr_vma(vma)->userptr.invalidate_link);
1662 			spin_unlock(&vm->userptr.invalidated_lock);
1663 		}
1664 		up_read(&vm->userptr.notifier_lock);
1665 		xe_bo_put_commit(&deferred);
1666 	}
1667 
1668 	return fence;
1669 }
1670