xref: /linux/drivers/gpu/drm/xe/xe_pt.c (revision 173b0b5b0e865348684c02bd9cb1d22b5d46e458)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2022 Intel Corporation
4  */
5 
6 #include "xe_pt.h"
7 
8 #include "xe_bo.h"
9 #include "xe_device.h"
10 #include "xe_drm_client.h"
11 #include "xe_gt.h"
12 #include "xe_gt_tlb_invalidation.h"
13 #include "xe_migrate.h"
14 #include "xe_pt_types.h"
15 #include "xe_pt_walk.h"
16 #include "xe_res_cursor.h"
17 #include "xe_trace.h"
18 #include "xe_ttm_stolen_mgr.h"
19 #include "xe_vm.h"
20 
21 struct xe_pt_dir {
22 	struct xe_pt pt;
23 	/** @children: Array of page-table child nodes */
24 	struct xe_ptw *children[XE_PDES];
25 };
26 
27 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
28 #define xe_pt_set_addr(__xe_pt, __addr) ((__xe_pt)->addr = (__addr))
29 #define xe_pt_addr(__xe_pt) ((__xe_pt)->addr)
30 #else
31 #define xe_pt_set_addr(__xe_pt, __addr)
32 #define xe_pt_addr(__xe_pt) 0ull
33 #endif
34 
35 static const u64 xe_normal_pt_shifts[] = {12, 21, 30, 39, 48};
36 static const u64 xe_compact_pt_shifts[] = {16, 21, 30, 39, 48};
37 
38 #define XE_PT_HIGHEST_LEVEL (ARRAY_SIZE(xe_normal_pt_shifts) - 1)
39 
40 static struct xe_pt_dir *as_xe_pt_dir(struct xe_pt *pt)
41 {
42 	return container_of(pt, struct xe_pt_dir, pt);
43 }
44 
45 static struct xe_pt *xe_pt_entry(struct xe_pt_dir *pt_dir, unsigned int index)
46 {
47 	return container_of(pt_dir->children[index], struct xe_pt, base);
48 }
49 
50 static u64 __xe_pt_empty_pte(struct xe_tile *tile, struct xe_vm *vm,
51 			     unsigned int level)
52 {
53 	struct xe_device *xe = tile_to_xe(tile);
54 	u16 pat_index = xe->pat.idx[XE_CACHE_WB];
55 	u8 id = tile->id;
56 
57 	if (!xe_vm_has_scratch(vm))
58 		return 0;
59 
60 	if (level > MAX_HUGEPTE_LEVEL)
61 		return vm->pt_ops->pde_encode_bo(vm->scratch_pt[id][level - 1]->bo,
62 						 0, pat_index);
63 
64 	return vm->pt_ops->pte_encode_addr(xe, 0, pat_index, level, IS_DGFX(xe), 0) |
65 		XE_PTE_NULL;
66 }
67 
68 static void xe_pt_free(struct xe_pt *pt)
69 {
70 	if (pt->level)
71 		kfree(as_xe_pt_dir(pt));
72 	else
73 		kfree(pt);
74 }
75 
76 /**
77  * xe_pt_create() - Create a page-table.
78  * @vm: The vm to create for.
79  * @tile: The tile to create for.
80  * @level: The page-table level.
81  *
82  * Allocate and initialize a single struct xe_pt metadata structure. Also
83  * create the corresponding page-table bo, but don't initialize it. If the
84  * level is greater than zero, then it's assumed to be a directory page-
85  * table and the directory structure is also allocated and initialized to
86  * NULL pointers.
87  *
88  * Return: A valid struct xe_pt pointer on success, an error pointer on
89  * error.
90  */
91 struct xe_pt *xe_pt_create(struct xe_vm *vm, struct xe_tile *tile,
92 			   unsigned int level)
93 {
94 	struct xe_pt *pt;
95 	struct xe_bo *bo;
96 	int err;
97 
98 	if (level) {
99 		struct xe_pt_dir *dir = kzalloc(sizeof(*dir), GFP_KERNEL);
100 
101 		pt = (dir) ? &dir->pt : NULL;
102 	} else {
103 		pt = kzalloc(sizeof(*pt), GFP_KERNEL);
104 	}
105 	if (!pt)
106 		return ERR_PTR(-ENOMEM);
107 
108 	pt->level = level;
109 	bo = xe_bo_create_pin_map(vm->xe, tile, vm, SZ_4K,
110 				  ttm_bo_type_kernel,
111 				  XE_BO_CREATE_VRAM_IF_DGFX(tile) |
112 				  XE_BO_CREATE_IGNORE_MIN_PAGE_SIZE_BIT |
113 				  XE_BO_CREATE_PINNED_BIT |
114 				  XE_BO_CREATE_NO_RESV_EVICT |
115 				  XE_BO_PAGETABLE);
116 	if (IS_ERR(bo)) {
117 		err = PTR_ERR(bo);
118 		goto err_kfree;
119 	}
120 	pt->bo = bo;
121 	pt->base.children = level ? as_xe_pt_dir(pt)->children : NULL;
122 
123 	if (vm->xef)
124 		xe_drm_client_add_bo(vm->xef->client, pt->bo);
125 	xe_tile_assert(tile, level <= XE_VM_MAX_LEVEL);
126 
127 	return pt;
128 
129 err_kfree:
130 	xe_pt_free(pt);
131 	return ERR_PTR(err);
132 }
133 
134 /**
135  * xe_pt_populate_empty() - Populate a page-table bo with scratch- or zero
136  * entries.
137  * @tile: The tile whose scratch page-table to use.
138  * @vm: The vm we populate for.
139  * @pt: The page-table whose bo to initialize.
140  *
141  * Populate the page-table bo of @pt with entries pointing into the tile's
142  * scratch page-table tree if any. Otherwise populate with zeros.
143  */
144 void xe_pt_populate_empty(struct xe_tile *tile, struct xe_vm *vm,
145 			  struct xe_pt *pt)
146 {
147 	struct iosys_map *map = &pt->bo->vmap;
148 	u64 empty;
149 	int i;
150 
151 	if (!xe_vm_has_scratch(vm)) {
152 		/*
153 		 * FIXME: Some memory is already allocated and zeroed?
154 		 * Find out which memory that is and avoid this memset...
155 		 */
156 		xe_map_memset(vm->xe, map, 0, 0, SZ_4K);
157 	} else {
158 		empty = __xe_pt_empty_pte(tile, vm, pt->level);
159 		for (i = 0; i < XE_PDES; i++)
160 			xe_pt_write(vm->xe, map, i, empty);
161 	}
162 }
163 
164 /**
165  * xe_pt_shift() - Return the ilog2 value of the size of the address range of
166  * a page-table at a certain level.
167  * @level: The level.
168  *
169  * Return: The ilog2 value of the size of the address range of a page-table
170  * at level @level.
171  */
172 unsigned int xe_pt_shift(unsigned int level)
173 {
174 	return XE_PTE_SHIFT + XE_PDE_SHIFT * level;
175 }
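
/*
 * Illustrative note (added for clarity, not in the original source): with
 * XE_PTE_SHIFT == 12 and XE_PDE_SHIFT == 9, this evaluates to 12, 21, 30
 * and 39 for levels 0..3, i.e. 4 KiB, 2 MiB, 1 GiB and 512 GiB granularity,
 * matching the steps in xe_normal_pt_shifts[].
 */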
176 
177 /**
178  * xe_pt_destroy() - Destroy a page-table tree.
179  * @pt: The root of the page-table tree to destroy.
180  * @flags: vm flags. Currently unused.
181  * @deferred: List head of lockless list for deferred putting. NULL for
182  *            immediate putting.
183  *
184  * Puts the page-table bo, recursively calls xe_pt_destroy on all children
185  * and finally frees @pt. TODO: Can we remove the @flags argument?
186  */
187 void xe_pt_destroy(struct xe_pt *pt, u32 flags, struct llist_head *deferred)
188 {
189 	int i;
190 
191 	if (!pt)
192 		return;
193 
194 	XE_WARN_ON(!list_empty(&pt->bo->ttm.base.gpuva.list));
195 	xe_bo_unpin(pt->bo);
196 	xe_bo_put_deferred(pt->bo, deferred);
197 
198 	if (pt->level > 0 && pt->num_live) {
199 		struct xe_pt_dir *pt_dir = as_xe_pt_dir(pt);
200 
201 		for (i = 0; i < XE_PDES; i++) {
202 			if (xe_pt_entry(pt_dir, i))
203 				xe_pt_destroy(xe_pt_entry(pt_dir, i), flags,
204 					      deferred);
205 		}
206 	}
207 	xe_pt_free(pt);
208 }
209 
210 /**
211  * DOC: Pagetable building
212  *
213  * Below we use the term "page-table" for both page-directories, containing
214  * pointers to lower level page-directories or page-tables, and level 0
215  * page-tables that contain only page-table-entries pointing to memory pages.
216  *
217  * When inserting an address range in an already existing page-table tree
218  * there will typically be a set of page-tables that are shared with other
219  * address ranges, and a set that are private to this address range.
220  * The set of shared page-tables can be at most two per level,
221  * and those can't be updated immediately because the entries of those
222  * page-tables may still be in use by the gpu for other mappings. Therefore
223  * when inserting entries into those, we instead stage those insertions by
224  * adding insertion data into struct xe_vm_pgtable_update structures. This
225  * data (subtrees for the CPU and page-table entries for the GPU) is then
226  * added in a separate commit step. CPU data is committed while still under the
227  * vm lock, the object lock and, for userptr, the notifier lock in read mode.
228  * The GPU async data is committed either by the GPU or CPU after fulfilling
229  * relevant dependencies.
230  * For non-shared page-tables (and, in fact, for shared ones that don't
231  * exist at the time of staging), we add the data in-place without the
232  * special update structures. This private part of the page-table tree will
233  * remain disconnected from the vm page-table tree until data is committed to
234  * the shared page tables of the vm tree in the commit phase.
235  */
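
/*
 * Illustrative sketch (added for clarity, not part of the original code):
 * ignoring locking, fences and error handling, a bind roughly follows
 *
 *	xe_pt_stage_bind(tile, vma, entries, &num_entries);
 *	fence = xe_migrate_update_pgtables(...);
 *	xe_pt_commit_bind(vma, entries, num_entries, rebind, deferred);
 *
 * where xe_migrate_update_pgtables() performs the GPU (or CPU) commit of
 * the staged entries. See __xe_pt_bind_vma() below for the full sequence
 * including locking, range fences and TLB invalidation.
 */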
236 
237 struct xe_pt_update {
238 	/** @update: The update structure we're building for this parent. */
239 	struct xe_vm_pgtable_update *update;
240 	/** @parent: The parent. Used to detect a parent change. */
241 	struct xe_pt *parent;
242 	/** @preexisting: Whether the parent was pre-existing or allocated */
243 	bool preexisting;
244 };
245 
246 struct xe_pt_stage_bind_walk {
247 	/** @base: The base class. */
248 	struct xe_pt_walk base;
249 
250 	/* Input parameters for the walk */
251 	/** @vm: The vm we're building for. */
252 	struct xe_vm *vm;
253 	/** @tile: The tile we're building for. */
254 	struct xe_tile *tile;
255 	/** @default_pte: PTE flag-only template; no address is associated. */
256 	u64 default_pte;
257 	/** @dma_offset: DMA offset to add to the PTE. */
258 	u64 dma_offset;
259 	/**
260 	 * @needs_64K: This address range enforces 64K alignment and
261 	 * granularity.
262 	 */
263 	bool needs_64K;
264 	/**
265 	 * @vma: VMA being mapped
266 	 */
267 	struct xe_vma *vma;
268 
269 	/* Also input, but is updated during the walk */
270 	/** @curs: The DMA address cursor. */
271 	struct xe_res_cursor *curs;
272 	/** @va_curs_start: The virtual address corresponding to @curs->start */
273 	u64 va_curs_start;
274 
275 	/* Output */
276 	struct xe_walk_update {
277 		/** @wupd.entries: Caller-provided storage. */
278 		struct xe_vm_pgtable_update *entries;
279 		/** @wupd.num_used_entries: Number of update @entries used. */
280 		unsigned int num_used_entries;
281 		/** @wupd.updates: Tracks the update entry at a given level */
282 		struct xe_pt_update updates[XE_VM_MAX_LEVEL + 1];
283 	} wupd;
284 
285 	/* Walk state */
286 	/**
287 	 * @l0_end_addr: The end address of the current l0 leaf. Used for
288 	 * 64K granularity detection.
289 	 */
290 	u64 l0_end_addr;
291 	/** @addr_64K: The start address of the current 64K chunk. */
292 	u64 addr_64K;
293 	/** @found_64K: Whether @addr_64K actually points to a 64K chunk. */
294 	bool found_64K;
295 };
296 
297 static int
298 xe_pt_new_shared(struct xe_walk_update *wupd, struct xe_pt *parent,
299 		 pgoff_t offset, bool alloc_entries)
300 {
301 	struct xe_pt_update *upd = &wupd->updates[parent->level];
302 	struct xe_vm_pgtable_update *entry;
303 
304 	/*
305 	 * For *each level*, we can only have one active
306 	 * struct xe_pt_update at any one time. Once we move on to a
307 	 * new parent and page-directory, the old one is complete, and
308 	 * updates are either already stored in the tree being built or in
309 	 * @wupd->entries.
310 	 */
311 	if (likely(upd->parent == parent))
312 		return 0;
313 
314 	upd->parent = parent;
315 	upd->preexisting = true;
316 
317 	if (wupd->num_used_entries == XE_VM_MAX_LEVEL * 2 + 1)
318 		return -EINVAL;
319 
320 	entry = wupd->entries + wupd->num_used_entries++;
321 	upd->update = entry;
322 	entry->ofs = offset;
323 	entry->pt_bo = parent->bo;
324 	entry->pt = parent;
325 	entry->flags = 0;
326 	entry->qwords = 0;
327 
328 	if (alloc_entries) {
329 		entry->pt_entries = kmalloc_array(XE_PDES,
330 						  sizeof(*entry->pt_entries),
331 						  GFP_KERNEL);
332 		if (!entry->pt_entries)
333 			return -ENOMEM;
334 	}
335 
336 	return 0;
337 }
338 
339 /*
340  * NOTE: This is a very frequently called function so we allow ourselves
341  * to annotate (using branch prediction hints) the fastpath of updating a
342  * non-pre-existing pagetable with leaf ptes.
343  */
344 static int
345 xe_pt_insert_entry(struct xe_pt_stage_bind_walk *xe_walk, struct xe_pt *parent,
346 		   pgoff_t offset, struct xe_pt *xe_child, u64 pte)
347 {
348 	struct xe_pt_update *upd = &xe_walk->wupd.updates[parent->level];
349 	struct xe_pt_update *child_upd = xe_child ?
350 		&xe_walk->wupd.updates[xe_child->level] : NULL;
351 	int ret;
352 
353 	ret = xe_pt_new_shared(&xe_walk->wupd, parent, offset, true);
354 	if (unlikely(ret))
355 		return ret;
356 
357 	/*
358 	 * Register this new pagetable so that it won't be recognized as
359 	 * a shared pagetable by a subsequent insertion.
360 	 */
361 	if (unlikely(child_upd)) {
362 		child_upd->update = NULL;
363 		child_upd->parent = xe_child;
364 		child_upd->preexisting = false;
365 	}
366 
367 	if (likely(!upd->preexisting)) {
368 		/* Continue building a non-connected subtree. */
369 		struct iosys_map *map = &parent->bo->vmap;
370 
371 		if (unlikely(xe_child))
372 			parent->base.children[offset] = &xe_child->base;
373 
374 		xe_pt_write(xe_walk->vm->xe, map, offset, pte);
375 		parent->num_live++;
376 	} else {
377 		/* Shared pt. Stage update. */
378 		unsigned int idx;
379 		struct xe_vm_pgtable_update *entry = upd->update;
380 
381 		idx = offset - entry->ofs;
382 		entry->pt_entries[idx].pt = xe_child;
383 		entry->pt_entries[idx].pte = pte;
384 		entry->qwords++;
385 	}
386 
387 	return 0;
388 }
389 
390 static bool xe_pt_hugepte_possible(u64 addr, u64 next, unsigned int level,
391 				   struct xe_pt_stage_bind_walk *xe_walk)
392 {
393 	u64 size, dma;
394 
395 	if (level > MAX_HUGEPTE_LEVEL)
396 		return false;
397 
398 	/* Does the requested virtual range cover a huge pte? */
399 	if (!xe_pt_covers(addr, next, level, &xe_walk->base))
400 		return false;
401 
402 	/* Does the DMA segment cover the whole pte? */
403 	if (next - xe_walk->va_curs_start > xe_walk->curs->size)
404 		return false;
405 
406 	/* null VMAs do not have dma addresses */
407 	if (xe_vma_is_null(xe_walk->vma))
408 		return true;
409 
410 	/* Is the DMA address huge PTE size aligned? */
411 	size = next - addr;
412 	dma = addr - xe_walk->va_curs_start + xe_res_dma(xe_walk->curs);
413 
414 	return IS_ALIGNED(dma, size);
415 }
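
/*
 * Worked example (added for clarity): at level 1 a huge PTE maps 2 MiB, so
 * [addr, next) qualifies only if it spans a whole 2 MiB-aligned region,
 * the current DMA segment covers all of it and, for non-null VMAs, the
 * corresponding DMA address is itself 2 MiB aligned.
 */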
416 
417 /*
418  * Scan the requested mapping to check whether it can be done entirely
419  * with 64K PTEs.
420  */
421 static bool
422 xe_pt_scan_64K(u64 addr, u64 next, struct xe_pt_stage_bind_walk *xe_walk)
423 {
424 	struct xe_res_cursor curs = *xe_walk->curs;
425 
426 	if (!IS_ALIGNED(addr, SZ_64K))
427 		return false;
428 
429 	if (next > xe_walk->l0_end_addr)
430 		return false;
431 
432 	/* null VMAs do not have dma addresses */
433 	if (xe_vma_is_null(xe_walk->vma))
434 		return true;
435 
436 	xe_res_next(&curs, addr - xe_walk->va_curs_start);
437 	for (; addr < next; addr += SZ_64K) {
438 		if (!IS_ALIGNED(xe_res_dma(&curs), SZ_64K) || curs.size < SZ_64K)
439 			return false;
440 
441 		xe_res_next(&curs, SZ_64K);
442 	}
443 
444 	return addr == next;
445 }
446 
447 /*
448  * For non-compact "normal" 4K level-0 pagetables, we want to try to group
449  * addresses together in 64K-contiguous regions to add a 64K TLB hint for the
450  * device to the PTE.
451  * This function determines whether the address is part of such a
452  * segment. For VRAM in normal pagetables, this is strictly necessary on
453  * some devices.
454  */
455 static bool
456 xe_pt_is_pte_ps64K(u64 addr, u64 next, struct xe_pt_stage_bind_walk *xe_walk)
457 {
458 	/* Address is within an already found 64K region */
459 	if (xe_walk->found_64K && addr - xe_walk->addr_64K < SZ_64K)
460 		return true;
461 
462 	xe_walk->found_64K = xe_pt_scan_64K(addr, addr + SZ_64K, xe_walk);
463 	xe_walk->addr_64K = addr;
464 
465 	return xe_walk->found_64K;
466 }
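
/*
 * Example (added for clarity): for a 64K-aligned mapping backed by
 * 64K-contiguous DMA memory, the first call at, say, addr == 0x200000
 * scans [0x200000, 0x210000) and caches the result, so the remaining 4K
 * entries at 0x201000..0x20f000 reuse @found_64K instead of rescanning
 * the resource cursor.
 */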
467 
468 static int
469 xe_pt_stage_bind_entry(struct xe_ptw *parent, pgoff_t offset,
470 		       unsigned int level, u64 addr, u64 next,
471 		       struct xe_ptw **child,
472 		       enum page_walk_action *action,
473 		       struct xe_pt_walk *walk)
474 {
475 	struct xe_pt_stage_bind_walk *xe_walk =
476 		container_of(walk, typeof(*xe_walk), base);
477 	u16 pat_index = xe_walk->vma->pat_index;
478 	struct xe_pt *xe_parent = container_of(parent, typeof(*xe_parent), base);
479 	struct xe_vm *vm = xe_walk->vm;
480 	struct xe_pt *xe_child;
481 	bool covers;
482 	int ret = 0;
483 	u64 pte;
484 
485 	/* Is this a leaf entry? */
486 	if (level == 0 || xe_pt_hugepte_possible(addr, next, level, xe_walk)) {
487 		struct xe_res_cursor *curs = xe_walk->curs;
488 		bool is_null = xe_vma_is_null(xe_walk->vma);
489 
490 		XE_WARN_ON(xe_walk->va_curs_start != addr);
491 
492 		pte = vm->pt_ops->pte_encode_vma(is_null ? 0 :
493 						 xe_res_dma(curs) + xe_walk->dma_offset,
494 						 xe_walk->vma, pat_index, level);
495 		pte |= xe_walk->default_pte;
496 
497 		/*
498 		 * Set the XE_PTE_PS64 hint if possible, otherwise if
499 		 * this device *requires* 64K PTE size for VRAM, fail.
500 		 */
501 		if (level == 0 && !xe_parent->is_compact) {
502 			if (xe_pt_is_pte_ps64K(addr, next, xe_walk)) {
503 				xe_walk->vma->gpuva.flags |= XE_VMA_PTE_64K;
504 				pte |= XE_PTE_PS64;
505 			} else if (XE_WARN_ON(xe_walk->needs_64K)) {
506 				return -EINVAL;
507 			}
508 		}
509 
510 		ret = xe_pt_insert_entry(xe_walk, xe_parent, offset, NULL, pte);
511 		if (unlikely(ret))
512 			return ret;
513 
514 		if (!is_null)
515 			xe_res_next(curs, next - addr);
516 		xe_walk->va_curs_start = next;
517 		xe_walk->vma->gpuva.flags |= (XE_VMA_PTE_4K << level);
518 		*action = ACTION_CONTINUE;
519 
520 		return ret;
521 	}
522 
523 	/*
524 	 * Descending to lower level. Determine if we need to allocate a
525 	 * new page table or -directory, which we do if there is no
526 	 * previous one or there is one we can completely replace.
527 	 */
528 	if (level == 1) {
529 		walk->shifts = xe_normal_pt_shifts;
530 		xe_walk->l0_end_addr = next;
531 	}
532 
533 	covers = xe_pt_covers(addr, next, level, &xe_walk->base);
534 	if (covers || !*child) {
535 		u64 flags = 0;
536 
537 		xe_child = xe_pt_create(xe_walk->vm, xe_walk->tile, level - 1);
538 		if (IS_ERR(xe_child))
539 			return PTR_ERR(xe_child);
540 
541 		xe_pt_set_addr(xe_child,
542 			       round_down(addr, 1ull << walk->shifts[level]));
543 
544 		if (!covers)
545 			xe_pt_populate_empty(xe_walk->tile, xe_walk->vm, xe_child);
546 
547 		*child = &xe_child->base;
548 
549 		/*
550 		 * Prefer the compact pagetable layout for L0 if possible. Only
551 		 * possible if the VMA covers the entire 2MB region, as compact 64K and
552 		 * 4K pages cannot be mixed within a 2MB region.
553 		 * TODO: Suballocate the pt bo to avoid wasting a lot of
554 		 * memory.
555 		 */
556 		if (GRAPHICS_VERx100(tile_to_xe(xe_walk->tile)) >= 1250 && level == 1 &&
557 		    covers && xe_pt_scan_64K(addr, next, xe_walk)) {
558 			walk->shifts = xe_compact_pt_shifts;
559 			xe_walk->vma->gpuva.flags |= XE_VMA_PTE_COMPACT;
560 			flags |= XE_PDE_64K;
561 			xe_child->is_compact = true;
562 		}
563 
564 		pte = vm->pt_ops->pde_encode_bo(xe_child->bo, 0, pat_index) | flags;
565 		ret = xe_pt_insert_entry(xe_walk, xe_parent, offset, xe_child,
566 					 pte);
567 	}
568 
569 	*action = ACTION_SUBTREE;
570 	return ret;
571 }
572 
573 static const struct xe_pt_walk_ops xe_pt_stage_bind_ops = {
574 	.pt_entry = xe_pt_stage_bind_entry,
575 };
576 
577 /**
578  * xe_pt_stage_bind() - Build a disconnected page-table tree for a given address
579  * range.
580  * @tile: The tile we're building for.
581  * @vma: The vma indicating the address range.
582  * @entries: Storage for the update entries used for connecting the tree to
583  * the main tree at commit time.
584  * @num_entries: On output contains the number of @entries used.
585  *
586  * This function builds a disconnected page-table tree for a given address
587  * range. The tree is connected to the main vm tree for the gpu using
588  * xe_migrate_update_pgtables() and for the cpu using xe_pt_commit_bind().
589  * The function builds xe_vm_pgtable_update structures for already existing
590  * shared page-tables, and non-existing shared and non-shared page-tables
591  * are built and populated directly.
592  *
593  * Return: 0 on success, negative error code on error.
594  */
595 static int
596 xe_pt_stage_bind(struct xe_tile *tile, struct xe_vma *vma,
597 		 struct xe_vm_pgtable_update *entries, u32 *num_entries)
598 {
599 	struct xe_device *xe = tile_to_xe(tile);
600 	struct xe_bo *bo = xe_vma_bo(vma);
601 	bool is_devmem = !xe_vma_is_userptr(vma) && bo &&
602 		(xe_bo_is_vram(bo) || xe_bo_is_stolen_devmem(bo));
603 	struct xe_res_cursor curs;
604 	struct xe_pt_stage_bind_walk xe_walk = {
605 		.base = {
606 			.ops = &xe_pt_stage_bind_ops,
607 			.shifts = xe_normal_pt_shifts,
608 			.max_level = XE_PT_HIGHEST_LEVEL,
609 		},
610 		.vm = xe_vma_vm(vma),
611 		.tile = tile,
612 		.curs = &curs,
613 		.va_curs_start = xe_vma_start(vma),
614 		.vma = vma,
615 		.wupd.entries = entries,
616 		.needs_64K = (xe_vma_vm(vma)->flags & XE_VM_FLAG_64K) && is_devmem,
617 	};
618 	struct xe_pt *pt = xe_vma_vm(vma)->pt_root[tile->id];
619 	int ret;
620 
621 	if (vma && (vma->gpuva.flags & XE_VMA_ATOMIC_PTE_BIT) &&
622 	    (is_devmem || !IS_DGFX(xe)))
623 		xe_walk.default_pte |= XE_USM_PPGTT_PTE_AE;
624 
625 	if (is_devmem) {
626 		xe_walk.default_pte |= XE_PPGTT_PTE_DM;
627 		xe_walk.dma_offset = vram_region_gpu_offset(bo->ttm.resource);
628 	}
629 
630 	if (!xe_vma_has_no_bo(vma) && xe_bo_is_stolen(bo))
631 		xe_walk.dma_offset = xe_ttm_stolen_gpu_offset(xe_bo_device(bo));
632 
633 	xe_bo_assert_held(bo);
634 
635 	if (!xe_vma_is_null(vma)) {
636 		if (xe_vma_is_userptr(vma))
637 			xe_res_first_sg(to_userptr_vma(vma)->userptr.sg, 0,
638 					xe_vma_size(vma), &curs);
639 		else if (xe_bo_is_vram(bo) || xe_bo_is_stolen(bo))
640 			xe_res_first(bo->ttm.resource, xe_vma_bo_offset(vma),
641 				     xe_vma_size(vma), &curs);
642 		else
643 			xe_res_first_sg(xe_bo_sg(bo), xe_vma_bo_offset(vma),
644 					xe_vma_size(vma), &curs);
645 	} else {
646 		curs.size = xe_vma_size(vma);
647 	}
648 
649 	ret = xe_pt_walk_range(&pt->base, pt->level, xe_vma_start(vma),
650 			       xe_vma_end(vma), &xe_walk.base);
651 
652 	*num_entries = xe_walk.wupd.num_used_entries;
653 	return ret;
654 }
655 
656 /**
657  * xe_pt_nonshared_offsets() - Determine the non-shared entry offsets of a
658  * shared pagetable.
659  * @addr: The start address within the pagetable.
660  * @end: The end address within the pagetable.
661  * @level: The level of the pagetable.
662  * @walk: Walk info.
663  * @action: Next action to perform (see enum page_walk_action); adjusted by this function.
664  * @offset: Ignored on input, first non-shared entry on output.
665  * @end_offset: Ignored on input, last non-shared entry + 1 on output.
666  *
667  * A shared page-table has some entries that belong to the address range
668  * and others that don't. This function determines the entries that belong
669  * fully to the address range. Depending on level, some entries may
670  * partially belong to the address range (that can't happen at level 0).
671  * The function detects that and adjusts those offsets to not include those
672  * partial entries. Iff it does detect partial entries, we know that there must
673  * be shared page tables also at lower levels, so it adjusts the walk action
674  * accordingly.
675  *
676  * Return: true if there were non-shared entries, false otherwise.
677  */
678 static bool xe_pt_nonshared_offsets(u64 addr, u64 end, unsigned int level,
679 				    struct xe_pt_walk *walk,
680 				    enum page_walk_action *action,
681 				    pgoff_t *offset, pgoff_t *end_offset)
682 {
683 	u64 size = 1ull << walk->shifts[level];
684 
685 	*offset = xe_pt_offset(addr, level, walk);
686 	*end_offset = xe_pt_num_entries(addr, end, level, walk) + *offset;
687 
688 	if (!level)
689 		return true;
690 
691 	/*
692 	 * If addr or next are not size aligned, there are shared pts at lower
693 	 * level, so in that case traverse down the subtree
694 	 */
695 	*action = ACTION_CONTINUE;
696 	if (!IS_ALIGNED(addr, size)) {
697 		*action = ACTION_SUBTREE;
698 		(*offset)++;
699 	}
700 
701 	if (!IS_ALIGNED(end, size)) {
702 		*action = ACTION_SUBTREE;
703 		(*end_offset)--;
704 	}
705 
706 	return *end_offset > *offset;
707 }
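
/*
 * Worked example (added for clarity): for the range [1 MiB, 5 MiB) applied
 * to a level-1 page-table (2 MiB per entry), neither end is 2 MiB aligned,
 * so the partially covered entries at both ends are skipped: only the entry
 * fully covering [2 MiB, 4 MiB) is reported, and the walk action is set to
 * ACTION_SUBTREE so the shared level-0 page-tables at the ends are visited.
 */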
708 
709 struct xe_pt_zap_ptes_walk {
710 	/** @base: The walk base-class */
711 	struct xe_pt_walk base;
712 
713 	/* Input parameters for the walk */
714 	/** @tile: The tile we're building for */
715 	struct xe_tile *tile;
716 
717 	/* Output */
718 	/** @needs_invalidate: Whether we need to invalidate TLB */
719 	bool needs_invalidate;
720 };
721 
722 static int xe_pt_zap_ptes_entry(struct xe_ptw *parent, pgoff_t offset,
723 				unsigned int level, u64 addr, u64 next,
724 				struct xe_ptw **child,
725 				enum page_walk_action *action,
726 				struct xe_pt_walk *walk)
727 {
728 	struct xe_pt_zap_ptes_walk *xe_walk =
729 		container_of(walk, typeof(*xe_walk), base);
730 	struct xe_pt *xe_child = container_of(*child, typeof(*xe_child), base);
731 	pgoff_t end_offset;
732 
733 	XE_WARN_ON(!*child);
734 	XE_WARN_ON(!level && xe_child->is_compact);
735 
736 	/*
737 	 * Note that we're called from an entry callback, and we're dealing
738 	 * with the child of that entry rather than the parent, so need to
739 	 * adjust level down.
740 	 */
741 	if (xe_pt_nonshared_offsets(addr, next, --level, walk, action, &offset,
742 				    &end_offset)) {
743 		xe_map_memset(tile_to_xe(xe_walk->tile), &xe_child->bo->vmap,
744 			      offset * sizeof(u64), 0,
745 			      (end_offset - offset) * sizeof(u64));
746 		xe_walk->needs_invalidate = true;
747 	}
748 
749 	return 0;
750 }
751 
752 static const struct xe_pt_walk_ops xe_pt_zap_ptes_ops = {
753 	.pt_entry = xe_pt_zap_ptes_entry,
754 };
755 
756 /**
757  * xe_pt_zap_ptes() - Zap (zero) gpu ptes of an address range
758  * @tile: The tile we're zapping for.
759  * @vma: GPU VMA detailing address range.
760  *
761  * Eviction and Userptr invalidation need to be able to zap the
762  * gpu ptes of a given address range in pagefaulting mode.
763  * In order to be able to do that, this function needs access to the shared
764  * page-table entries so it can either clear the leaf PTEs or
765  * clear the pointers to lower-level page-tables. The caller is required
766  * to hold the necessary locks to ensure neither the page-table connectivity
767  * nor the page-table entries of the range is updated from under us.
768  *
769  * Return: Whether ptes were actually updated and a TLB invalidation is
770  * required.
771  */
772 bool xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma *vma)
773 {
774 	struct xe_pt_zap_ptes_walk xe_walk = {
775 		.base = {
776 			.ops = &xe_pt_zap_ptes_ops,
777 			.shifts = xe_normal_pt_shifts,
778 			.max_level = XE_PT_HIGHEST_LEVEL,
779 		},
780 		.tile = tile,
781 	};
782 	struct xe_pt *pt = xe_vma_vm(vma)->pt_root[tile->id];
783 
784 	if (!(vma->tile_present & BIT(tile->id)))
785 		return false;
786 
787 	(void)xe_pt_walk_shared(&pt->base, pt->level, xe_vma_start(vma),
788 				xe_vma_end(vma), &xe_walk.base);
789 
790 	return xe_walk.needs_invalidate;
791 }
792 
793 static void
794 xe_vm_populate_pgtable(struct xe_migrate_pt_update *pt_update, struct xe_tile *tile,
795 		       struct iosys_map *map, void *data,
796 		       u32 qword_ofs, u32 num_qwords,
797 		       const struct xe_vm_pgtable_update *update)
798 {
799 	struct xe_pt_entry *ptes = update->pt_entries;
800 	u64 *ptr = data;
801 	u32 i;
802 
803 	for (i = 0; i < num_qwords; i++) {
804 		if (map)
805 			xe_map_wr(tile_to_xe(tile), map, (qword_ofs + i) *
806 				  sizeof(u64), u64, ptes[i].pte);
807 		else
808 			ptr[i] = ptes[i].pte;
809 	}
810 }
811 
812 static void xe_pt_abort_bind(struct xe_vma *vma,
813 			     struct xe_vm_pgtable_update *entries,
814 			     u32 num_entries)
815 {
816 	u32 i, j;
817 
818 	for (i = 0; i < num_entries; i++) {
819 		if (!entries[i].pt_entries)
820 			continue;
821 
822 		for (j = 0; j < entries[i].qwords; j++)
823 			xe_pt_destroy(entries[i].pt_entries[j].pt, xe_vma_vm(vma)->flags, NULL);
824 		kfree(entries[i].pt_entries);
825 	}
826 }
827 
828 static void xe_pt_commit_locks_assert(struct xe_vma *vma)
829 {
830 	struct xe_vm *vm = xe_vma_vm(vma);
831 
832 	lockdep_assert_held(&vm->lock);
833 
834 	if (xe_vma_is_userptr(vma))
835 		lockdep_assert_held_read(&vm->userptr.notifier_lock);
836 	else if (!xe_vma_is_null(vma))
837 		dma_resv_assert_held(xe_vma_bo(vma)->ttm.base.resv);
838 
839 	xe_vm_assert_held(vm);
840 }
841 
842 static void xe_pt_commit_bind(struct xe_vma *vma,
843 			      struct xe_vm_pgtable_update *entries,
844 			      u32 num_entries, bool rebind,
845 			      struct llist_head *deferred)
846 {
847 	u32 i, j;
848 
849 	xe_pt_commit_locks_assert(vma);
850 
851 	for (i = 0; i < num_entries; i++) {
852 		struct xe_pt *pt = entries[i].pt;
853 		struct xe_pt_dir *pt_dir;
854 
855 		if (!rebind)
856 			pt->num_live += entries[i].qwords;
857 
858 		if (!pt->level) {
859 			kfree(entries[i].pt_entries);
860 			continue;
861 		}
862 
863 		pt_dir = as_xe_pt_dir(pt);
864 		for (j = 0; j < entries[i].qwords; j++) {
865 			u32 j_ = j + entries[i].ofs;
866 			struct xe_pt *newpte = entries[i].pt_entries[j].pt;
867 
868 			if (xe_pt_entry(pt_dir, j_))
869 				xe_pt_destroy(xe_pt_entry(pt_dir, j_),
870 					      xe_vma_vm(vma)->flags, deferred);
871 
872 			pt_dir->children[j_] = &newpte->base;
873 		}
874 		kfree(entries[i].pt_entries);
875 	}
876 }
877 
878 static int
879 xe_pt_prepare_bind(struct xe_tile *tile, struct xe_vma *vma,
880 		   struct xe_vm_pgtable_update *entries, u32 *num_entries)
881 {
882 	int err;
883 
884 	*num_entries = 0;
885 	err = xe_pt_stage_bind(tile, vma, entries, num_entries);
886 	if (!err)
887 		xe_tile_assert(tile, *num_entries);
888 	else /* abort! */
889 		xe_pt_abort_bind(vma, entries, *num_entries);
890 
891 	return err;
892 }
893 
894 static void xe_vm_dbg_print_entries(struct xe_device *xe,
895 				    const struct xe_vm_pgtable_update *entries,
896 				    unsigned int num_entries)
897 #if (IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM))
898 {
899 	unsigned int i;
900 
901 	vm_dbg(&xe->drm, "%u entries to update\n", num_entries);
902 	for (i = 0; i < num_entries; i++) {
903 		const struct xe_vm_pgtable_update *entry = &entries[i];
904 		struct xe_pt *xe_pt = entry->pt;
905 		u64 page_size = 1ull << xe_pt_shift(xe_pt->level);
906 		u64 end;
907 		u64 start;
908 
909 		xe_assert(xe, !entry->pt->is_compact);
910 		start = entry->ofs * page_size;
911 		end = start + page_size * entry->qwords;
912 		vm_dbg(&xe->drm,
913 		       "\t%u: Update level %u at (%u + %u) [%llx...%llx) f:%x\n",
914 		       i, xe_pt->level, entry->ofs, entry->qwords,
915 		       xe_pt_addr(xe_pt) + start, xe_pt_addr(xe_pt) + end, 0);
916 	}
917 }
918 #else
919 {}
920 #endif
921 
922 #ifdef CONFIG_DRM_XE_USERPTR_INVAL_INJECT
923 
924 static bool xe_pt_userptr_inject_eagain(struct xe_userptr_vma *uvma)
925 {
926 	u32 divisor = uvma->userptr.divisor ? uvma->userptr.divisor : 2;
927 	static u32 count;
928 
929 	if (count++ % divisor == divisor - 1) {
930 		struct xe_vm *vm = xe_vma_vm(&uvma->vma);
931 
932 		uvma->userptr.divisor = divisor << 1;
933 		spin_lock(&vm->userptr.invalidated_lock);
934 		list_move_tail(&uvma->userptr.invalidate_link,
935 			       &vm->userptr.invalidated);
936 		spin_unlock(&vm->userptr.invalidated_lock);
937 		return true;
938 	}
939 
940 	return false;
941 }
942 
943 #else
944 
945 static bool xe_pt_userptr_inject_eagain(struct xe_userptr_vma *uvma)
946 {
947 	return false;
948 }
949 
950 #endif
951 
952 /**
953  * struct xe_pt_migrate_pt_update - Callback argument for pre-commit callbacks
954  * @base: Base we derive from.
955  * @bind: Whether this is a bind or an unbind operation. A bind operation
956  *        makes the pre-commit callback error with -EAGAIN if it detects a
957  *        pending invalidation.
958  * @locked: Whether the pre-commit callback locked the userptr notifier lock
959  *          and it needs unlocking.
960  */
961 struct xe_pt_migrate_pt_update {
962 	struct xe_migrate_pt_update base;
963 	bool bind;
964 	bool locked;
965 };
966 
967 /*
968  * This function adds the needed dependencies to a page-table update job
969  * to make sure racing jobs for separate bind engines don't race writing
970  * to the same page-table range, wreaking havoc. Initially use a single
971  * fence for the entire VM. An optimization would use smaller granularity.
972  */
973 static int xe_pt_vm_dependencies(struct xe_sched_job *job,
974 				 struct xe_range_fence_tree *rftree,
975 				 u64 start, u64 last)
976 {
977 	struct xe_range_fence *rtfence;
978 	struct dma_fence *fence;
979 	int err;
980 
981 	rtfence = xe_range_fence_tree_first(rftree, start, last);
982 	while (rtfence) {
983 		fence = rtfence->fence;
984 
985 		if (!dma_fence_is_signaled(fence)) {
986 			/*
987 			 * Is this a CPU update? GPU is busy updating, so return
988 			 * an error
989 			 */
990 			if (!job)
991 				return -ETIME;
992 
993 			dma_fence_get(fence);
994 			err = drm_sched_job_add_dependency(&job->drm, fence);
995 			if (err)
996 				return err;
997 		}
998 
999 		rtfence = xe_range_fence_tree_next(rtfence, start, last);
1000 	}
1001 
1002 	return 0;
1003 }
1004 
1005 static int xe_pt_pre_commit(struct xe_migrate_pt_update *pt_update)
1006 {
1007 	struct xe_range_fence_tree *rftree =
1008 		&xe_vma_vm(pt_update->vma)->rftree[pt_update->tile_id];
1009 
1010 	return xe_pt_vm_dependencies(pt_update->job, rftree,
1011 				     pt_update->start, pt_update->last);
1012 }
1013 
1014 static int xe_pt_userptr_pre_commit(struct xe_migrate_pt_update *pt_update)
1015 {
1016 	struct xe_pt_migrate_pt_update *userptr_update =
1017 		container_of(pt_update, typeof(*userptr_update), base);
1018 	struct xe_userptr_vma *uvma = to_userptr_vma(pt_update->vma);
1019 	unsigned long notifier_seq = uvma->userptr.notifier_seq;
1020 	struct xe_vm *vm = xe_vma_vm(&uvma->vma);
1021 	int err = xe_pt_vm_dependencies(pt_update->job,
1022 					&vm->rftree[pt_update->tile_id],
1023 					pt_update->start,
1024 					pt_update->last);
1025 
1026 	if (err)
1027 		return err;
1028 
1029 	userptr_update->locked = false;
1030 
1031 	/*
1032 	 * Wait until nobody is running the invalidation notifier, and
1033 	 * since we're exiting the loop holding the notifier lock,
1034 	 * nobody can proceed invalidating either.
1035 	 *
1036 	 * Note that we don't update the vma->userptr.notifier_seq since
1037 	 * we don't update the userptr pages.
1038 	 */
1039 	do {
1040 		down_read(&vm->userptr.notifier_lock);
1041 		if (!mmu_interval_read_retry(&uvma->userptr.notifier,
1042 					     notifier_seq))
1043 			break;
1044 
1045 		up_read(&vm->userptr.notifier_lock);
1046 
1047 		if (userptr_update->bind)
1048 			return -EAGAIN;
1049 
1050 		notifier_seq = mmu_interval_read_begin(&uvma->userptr.notifier);
1051 	} while (true);
1052 
1053 	/* Inject errors to test whether they are handled correctly */
1054 	if (userptr_update->bind && xe_pt_userptr_inject_eagain(uvma)) {
1055 		up_read(&vm->userptr.notifier_lock);
1056 		return -EAGAIN;
1057 	}
1058 
1059 	userptr_update->locked = true;
1060 
1061 	return 0;
1062 }
1063 
1064 static const struct xe_migrate_pt_update_ops bind_ops = {
1065 	.populate = xe_vm_populate_pgtable,
1066 	.pre_commit = xe_pt_pre_commit,
1067 };
1068 
1069 static const struct xe_migrate_pt_update_ops userptr_bind_ops = {
1070 	.populate = xe_vm_populate_pgtable,
1071 	.pre_commit = xe_pt_userptr_pre_commit,
1072 };
1073 
1074 struct invalidation_fence {
1075 	struct xe_gt_tlb_invalidation_fence base;
1076 	struct xe_gt *gt;
1077 	struct xe_vma *vma;
1078 	struct dma_fence *fence;
1079 	struct dma_fence_cb cb;
1080 	struct work_struct work;
1081 };
1082 
1083 static const char *
1084 invalidation_fence_get_driver_name(struct dma_fence *dma_fence)
1085 {
1086 	return "xe";
1087 }
1088 
1089 static const char *
1090 invalidation_fence_get_timeline_name(struct dma_fence *dma_fence)
1091 {
1092 	return "invalidation_fence";
1093 }
1094 
1095 static const struct dma_fence_ops invalidation_fence_ops = {
1096 	.get_driver_name = invalidation_fence_get_driver_name,
1097 	.get_timeline_name = invalidation_fence_get_timeline_name,
1098 };
1099 
1100 static void invalidation_fence_cb(struct dma_fence *fence,
1101 				  struct dma_fence_cb *cb)
1102 {
1103 	struct invalidation_fence *ifence =
1104 		container_of(cb, struct invalidation_fence, cb);
1105 
1106 	trace_xe_gt_tlb_invalidation_fence_cb(&ifence->base);
1107 	if (!ifence->fence->error) {
1108 		queue_work(system_wq, &ifence->work);
1109 	} else {
1110 		ifence->base.base.error = ifence->fence->error;
1111 		dma_fence_signal(&ifence->base.base);
1112 		dma_fence_put(&ifence->base.base);
1113 	}
1114 	dma_fence_put(ifence->fence);
1115 }
1116 
1117 static void invalidation_fence_work_func(struct work_struct *w)
1118 {
1119 	struct invalidation_fence *ifence =
1120 		container_of(w, struct invalidation_fence, work);
1121 
1122 	trace_xe_gt_tlb_invalidation_fence_work_func(&ifence->base);
1123 	xe_gt_tlb_invalidation_vma(ifence->gt, &ifence->base, ifence->vma);
1124 }
1125 
1126 static int invalidation_fence_init(struct xe_gt *gt,
1127 				   struct invalidation_fence *ifence,
1128 				   struct dma_fence *fence,
1129 				   struct xe_vma *vma)
1130 {
1131 	int ret;
1132 
1133 	trace_xe_gt_tlb_invalidation_fence_create(&ifence->base);
1134 
1135 	spin_lock_irq(&gt->tlb_invalidation.lock);
1136 	dma_fence_init(&ifence->base.base, &invalidation_fence_ops,
1137 		       &gt->tlb_invalidation.lock,
1138 		       dma_fence_context_alloc(1), 1);
1139 	spin_unlock_irq(&gt->tlb_invalidation.lock);
1140 
1141 	INIT_LIST_HEAD(&ifence->base.link);
1142 
1143 	dma_fence_get(&ifence->base.base);	/* Ref for caller */
1144 	ifence->fence = fence;
1145 	ifence->gt = gt;
1146 	ifence->vma = vma;
1147 
1148 	INIT_WORK(&ifence->work, invalidation_fence_work_func);
1149 	ret = dma_fence_add_callback(fence, &ifence->cb, invalidation_fence_cb);
1150 	if (ret == -ENOENT) {
1151 		dma_fence_put(ifence->fence);	/* Usually dropped in CB */
1152 		invalidation_fence_work_func(&ifence->work);
1153 	} else if (ret) {
1154 		dma_fence_put(&ifence->base.base);	/* Caller ref */
1155 		dma_fence_put(&ifence->base.base);	/* Creation ref */
1156 	}
1157 
1158 	xe_gt_assert(gt, !ret || ret == -ENOENT);
1159 
1160 	return ret && ret != -ENOENT ? ret : 0;
1161 }
1162 
1163 static void xe_pt_calc_rfence_interval(struct xe_vma *vma,
1164 				       struct xe_pt_migrate_pt_update *update,
1165 				       struct xe_vm_pgtable_update *entries,
1166 				       u32 num_entries)
1167 {
1168 	int i, level = 0;
1169 
1170 	for (i = 0; i < num_entries; i++) {
1171 		const struct xe_vm_pgtable_update *entry = &entries[i];
1172 
1173 		if (entry->pt->level > level)
1174 			level = entry->pt->level;
1175 	}
1176 
1177 	/* Greedy (non-optimal) calculation but simple */
1178 	update->base.start = ALIGN_DOWN(xe_vma_start(vma),
1179 					0x1ull << xe_pt_shift(level));
1180 	update->base.last = ALIGN(xe_vma_end(vma),
1181 				  0x1ull << xe_pt_shift(level)) - 1;
1182 }
1183 
1184 /**
1185  * __xe_pt_bind_vma() - Build and connect a page-table tree for the vma
1186  * address range.
1187  * @tile: The tile to bind for.
1188  * @vma: The vma to bind.
1189  * @q: The exec_queue with which to do pipelined page-table updates.
1190  * @syncs: Entries to sync on before binding the built tree to the live vm tree.
1191  * @num_syncs: Number of @sync entries.
1192  * @rebind: Whether we're rebinding this vma to the same address range without
1193  * an unbind in-between.
1194  *
1195  * This function builds a page-table tree (see xe_pt_stage_bind() for more
1196  * information on page-table building), and the xe_vm_pgtable_update entries
1197  * abstracting the operations needed to attach it to the main vm tree. It
1198  * then takes the relevant locks and updates the metadata side of the main
1199  * vm tree and submits the operations for pipelined attachment of the
1200  * gpu page-table to the vm main tree (which can be done either by the
1201  * CPU or the GPU).
1202  *
1203  * Return: A valid dma-fence representing the pipelined attachment operation
1204  * on success, an error pointer on error.
1205  */
1206 struct dma_fence *
1207 __xe_pt_bind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queue *q,
1208 		 struct xe_sync_entry *syncs, u32 num_syncs,
1209 		 bool rebind)
1210 {
1211 	struct xe_vm_pgtable_update entries[XE_VM_MAX_LEVEL * 2 + 1];
1212 	struct xe_pt_migrate_pt_update bind_pt_update = {
1213 		.base = {
1214 			.ops = xe_vma_is_userptr(vma) ? &userptr_bind_ops : &bind_ops,
1215 			.vma = vma,
1216 			.tile_id = tile->id,
1217 		},
1218 		.bind = true,
1219 	};
1220 	struct xe_vm *vm = xe_vma_vm(vma);
1221 	u32 num_entries;
1222 	struct dma_fence *fence;
1223 	struct invalidation_fence *ifence = NULL;
1224 	struct xe_range_fence *rfence;
1225 	int err;
1226 
1227 	bind_pt_update.locked = false;
1228 	xe_bo_assert_held(xe_vma_bo(vma));
1229 	xe_vm_assert_held(vm);
1230 
1231 	vm_dbg(&xe_vma_vm(vma)->xe->drm,
1232 	       "Preparing bind, with range [%llx...%llx) engine %p.\n",
1233 	       xe_vma_start(vma), xe_vma_end(vma), q);
1234 
1235 	err = xe_pt_prepare_bind(tile, vma, entries, &num_entries);
1236 	if (err)
1237 		goto err;
1238 
1239 	err = dma_resv_reserve_fences(xe_vm_resv(vm), 1);
1240 	if (!err && !xe_vma_has_no_bo(vma) && !xe_vma_bo(vma)->vm)
1241 		err = dma_resv_reserve_fences(xe_vma_bo(vma)->ttm.base.resv, 1);
1242 	if (err)
1243 		goto err;
1244 
1245 	xe_tile_assert(tile, num_entries <= ARRAY_SIZE(entries));
1246 
1247 	xe_vm_dbg_print_entries(tile_to_xe(tile), entries, num_entries);
1248 	xe_pt_calc_rfence_interval(vma, &bind_pt_update, entries,
1249 				   num_entries);
1250 
1251 	/*
1252 	 * If rebind, we have to invalidate TLB on !LR vms to invalidate
1253 	 * cached PTEs pointing to freed memory. On LR vms this is done
1254 	 * automatically when the context is re-enabled by the rebind worker,
1255 	 * or in fault mode it was invalidated on PTE zapping.
1256 	 *
1257 	 * If !rebind, and scratch enabled VMs, there is a chance the scratch
1258 	 * PTE is already cached in the TLB so it needs to be invalidated.
1259 	 * On !LR VMs this is done in the ring ops preceding a batch, but on
1260 	 * non-faulting LR, in particular on user-space batch buffer chaining,
1261 	 * it needs to be done here.
1262 	 */
1263 	if ((!rebind && xe_vm_has_scratch(vm) && xe_vm_in_preempt_fence_mode(vm))) {
1264 		ifence = kzalloc(sizeof(*ifence), GFP_KERNEL);
1265 		if (!ifence)
1266 			return ERR_PTR(-ENOMEM);
1267 	} else if (rebind && !xe_vm_in_lr_mode(vm)) {
1268 		/* We bump also if batch_invalidate_tlb is true */
1269 		vm->tlb_flush_seqno++;
1270 	}
1271 
1272 	rfence = kzalloc(sizeof(*rfence), GFP_KERNEL);
1273 	if (!rfence) {
1274 		kfree(ifence);
1275 		return ERR_PTR(-ENOMEM);
1276 	}
1277 
1278 	fence = xe_migrate_update_pgtables(tile->migrate,
1279 					   vm, xe_vma_bo(vma), q,
1280 					   entries, num_entries,
1281 					   syncs, num_syncs,
1282 					   &bind_pt_update.base);
1283 	if (!IS_ERR(fence)) {
1284 		bool last_munmap_rebind = vma->gpuva.flags & XE_VMA_LAST_REBIND;
1285 		LLIST_HEAD(deferred);
1286 		int err;
1287 
1288 		err = xe_range_fence_insert(&vm->rftree[tile->id], rfence,
1289 					    &xe_range_fence_kfree_ops,
1290 					    bind_pt_update.base.start,
1291 					    bind_pt_update.base.last, fence);
1292 		if (err)
1293 			dma_fence_wait(fence, false);
1294 
1295 		/* TLB invalidation must be done before signaling rebind */
1296 		if (ifence) {
1297 			int err = invalidation_fence_init(tile->primary_gt, ifence, fence,
1298 							  vma);
1299 			if (err) {
1300 				dma_fence_put(fence);
1301 				kfree(ifence);
1302 				return ERR_PTR(err);
1303 			}
1304 			fence = &ifence->base.base;
1305 		}
1306 
1307 		/* add shared fence now for pagetable delayed destroy */
1308 		dma_resv_add_fence(xe_vm_resv(vm), fence, rebind ||
1309 				   last_munmap_rebind ?
1310 				   DMA_RESV_USAGE_KERNEL :
1311 				   DMA_RESV_USAGE_BOOKKEEP);
1312 
1313 		if (!xe_vma_has_no_bo(vma) && !xe_vma_bo(vma)->vm)
1314 			dma_resv_add_fence(xe_vma_bo(vma)->ttm.base.resv, fence,
1315 					   DMA_RESV_USAGE_BOOKKEEP);
1316 		xe_pt_commit_bind(vma, entries, num_entries, rebind,
1317 				  bind_pt_update.locked ? &deferred : NULL);
1318 
1319 		/* This vma is live (again?) now */
1320 		vma->tile_present |= BIT(tile->id);
1321 
1322 		if (bind_pt_update.locked) {
1323 			to_userptr_vma(vma)->userptr.initial_bind = true;
1324 			up_read(&vm->userptr.notifier_lock);
1325 			xe_bo_put_commit(&deferred);
1326 		}
1327 		if (!rebind && last_munmap_rebind &&
1328 		    xe_vm_in_preempt_fence_mode(vm))
1329 			xe_vm_queue_rebind_worker(vm);
1330 	} else {
1331 		kfree(rfence);
1332 		kfree(ifence);
1333 		if (bind_pt_update.locked)
1334 			up_read(&vm->userptr.notifier_lock);
1335 		xe_pt_abort_bind(vma, entries, num_entries);
1336 	}
1337 
1338 	return fence;
1339 
1340 err:
1341 	return ERR_PTR(err);
1342 }
1343 
1344 struct xe_pt_stage_unbind_walk {
1345 	/** @base: The pagewalk base-class. */
1346 	struct xe_pt_walk base;
1347 
1348 	/* Input parameters for the walk */
1349 	/** @tile: The tile we're unbinding from. */
1350 	struct xe_tile *tile;
1351 
1352 	/**
1353 	 * @modified_start: Walk range start, modified to include any
1354 	 * shared pagetables that we're the only user of and can thus
1355 	 * treat as private.
1356 	 */
1357 	u64 modified_start;
1358 	/** @modified_end: Walk range end, modified like @modified_start. */
1359 	u64 modified_end;
1360 
1361 	/* Output */
1362 	/** @wupd: Structure to track the page-table updates we're building */
1363 	struct xe_walk_update wupd;
1364 };
1365 
1366 /*
1367  * Check whether this range is the only one populating this pagetable,
1368  * and in that case, update the walk range checks so that higher levels don't
1369  * view us as a shared pagetable.
1370  */
1371 static bool xe_pt_check_kill(u64 addr, u64 next, unsigned int level,
1372 			     const struct xe_pt *child,
1373 			     enum page_walk_action *action,
1374 			     struct xe_pt_walk *walk)
1375 {
1376 	struct xe_pt_stage_unbind_walk *xe_walk =
1377 		container_of(walk, typeof(*xe_walk), base);
1378 	unsigned int shift = walk->shifts[level];
1379 	u64 size = 1ull << shift;
1380 
1381 	if (IS_ALIGNED(addr, size) && IS_ALIGNED(next, size) &&
1382 	    ((next - addr) >> shift) == child->num_live) {
1383 		u64 size = 1ull << walk->shifts[level + 1];
1384 
1385 		*action = ACTION_CONTINUE;
1386 
1387 		if (xe_walk->modified_start >= addr)
1388 			xe_walk->modified_start = round_down(addr, size);
1389 		if (xe_walk->modified_end <= next)
1390 			xe_walk->modified_end = round_up(next, size);
1391 
1392 		return true;
1393 	}
1394 
1395 	return false;
1396 }
1397 
1398 static int xe_pt_stage_unbind_entry(struct xe_ptw *parent, pgoff_t offset,
1399 				    unsigned int level, u64 addr, u64 next,
1400 				    struct xe_ptw **child,
1401 				    enum page_walk_action *action,
1402 				    struct xe_pt_walk *walk)
1403 {
1404 	struct xe_pt *xe_child = container_of(*child, typeof(*xe_child), base);
1405 
1406 	XE_WARN_ON(!*child);
1407 	XE_WARN_ON(!level && xe_child->is_compact);
1408 
1409 	xe_pt_check_kill(addr, next, level - 1, xe_child, action, walk);
1410 
1411 	return 0;
1412 }
1413 
1414 static int
1415 xe_pt_stage_unbind_post_descend(struct xe_ptw *parent, pgoff_t offset,
1416 				unsigned int level, u64 addr, u64 next,
1417 				struct xe_ptw **child,
1418 				enum page_walk_action *action,
1419 				struct xe_pt_walk *walk)
1420 {
1421 	struct xe_pt_stage_unbind_walk *xe_walk =
1422 		container_of(walk, typeof(*xe_walk), base);
1423 	struct xe_pt *xe_child = container_of(*child, typeof(*xe_child), base);
1424 	pgoff_t end_offset;
1425 	u64 size = 1ull << walk->shifts[--level];
1426 
1427 	if (!IS_ALIGNED(addr, size))
1428 		addr = xe_walk->modified_start;
1429 	if (!IS_ALIGNED(next, size))
1430 		next = xe_walk->modified_end;
1431 
1432 	/* Parent == *child is the root pt. Don't kill it. */
1433 	if (parent != *child &&
1434 	    xe_pt_check_kill(addr, next, level, xe_child, action, walk))
1435 		return 0;
1436 
1437 	if (!xe_pt_nonshared_offsets(addr, next, level, walk, action, &offset,
1438 				     &end_offset))
1439 		return 0;
1440 
1441 	(void)xe_pt_new_shared(&xe_walk->wupd, xe_child, offset, false);
1442 	xe_walk->wupd.updates[level].update->qwords = end_offset - offset;
1443 
1444 	return 0;
1445 }
1446 
1447 static const struct xe_pt_walk_ops xe_pt_stage_unbind_ops = {
1448 	.pt_entry = xe_pt_stage_unbind_entry,
1449 	.pt_post_descend = xe_pt_stage_unbind_post_descend,
1450 };
1451 
1452 /**
1453  * xe_pt_stage_unbind() - Build page-table update structures for an unbind
1454  * operation
1455  * @tile: The tile we're unbinding for.
1456  * @vma: The vma we're unbinding.
1457  * @entries: Caller-provided storage for the update structures.
1458  *
1459  * Builds page-table update structures for an unbind operation. The function
1460  * will attempt to remove all page-tables that we're the only user
1461  * of, and for that to work, the unbind operation must be committed in the
1462  * same critical section that blocks racing binds to the same page-table tree.
1463  *
1464  * Return: The number of entries used.
1465  */
1466 static unsigned int xe_pt_stage_unbind(struct xe_tile *tile, struct xe_vma *vma,
1467 				       struct xe_vm_pgtable_update *entries)
1468 {
1469 	struct xe_pt_stage_unbind_walk xe_walk = {
1470 		.base = {
1471 			.ops = &xe_pt_stage_unbind_ops,
1472 			.shifts = xe_normal_pt_shifts,
1473 			.max_level = XE_PT_HIGHEST_LEVEL,
1474 		},
1475 		.tile = tile,
1476 		.modified_start = xe_vma_start(vma),
1477 		.modified_end = xe_vma_end(vma),
1478 		.wupd.entries = entries,
1479 	};
1480 	struct xe_pt *pt = xe_vma_vm(vma)->pt_root[tile->id];
1481 
1482 	(void)xe_pt_walk_shared(&pt->base, pt->level, xe_vma_start(vma),
1483 				xe_vma_end(vma), &xe_walk.base);
1484 
1485 	return xe_walk.wupd.num_used_entries;
1486 }
1487 
1488 static void
1489 xe_migrate_clear_pgtable_callback(struct xe_migrate_pt_update *pt_update,
1490 				  struct xe_tile *tile, struct iosys_map *map,
1491 				  void *ptr, u32 qword_ofs, u32 num_qwords,
1492 				  const struct xe_vm_pgtable_update *update)
1493 {
1494 	struct xe_vma *vma = pt_update->vma;
1495 	u64 empty = __xe_pt_empty_pte(tile, xe_vma_vm(vma), update->pt->level);
1496 	int i;
1497 
1498 	if (map && map->is_iomem)
1499 		for (i = 0; i < num_qwords; ++i)
1500 			xe_map_wr(tile_to_xe(tile), map, (qword_ofs + i) *
1501 				  sizeof(u64), u64, empty);
1502 	else if (map)
1503 		memset64(map->vaddr + qword_ofs * sizeof(u64), empty,
1504 			 num_qwords);
1505 	else
1506 		memset64(ptr, empty, num_qwords);
1507 }
1508 
1509 static void
1510 xe_pt_commit_unbind(struct xe_vma *vma,
1511 		    struct xe_vm_pgtable_update *entries, u32 num_entries,
1512 		    struct llist_head *deferred)
1513 {
1514 	u32 j;
1515 
1516 	xe_pt_commit_locks_assert(vma);
1517 
1518 	for (j = 0; j < num_entries; ++j) {
1519 		struct xe_vm_pgtable_update *entry = &entries[j];
1520 		struct xe_pt *pt = entry->pt;
1521 
1522 		pt->num_live -= entry->qwords;
1523 		if (pt->level) {
1524 			struct xe_pt_dir *pt_dir = as_xe_pt_dir(pt);
1525 			u32 i;
1526 
1527 			for (i = entry->ofs; i < entry->ofs + entry->qwords;
1528 			     i++) {
1529 				if (xe_pt_entry(pt_dir, i))
1530 					xe_pt_destroy(xe_pt_entry(pt_dir, i),
1531 						      xe_vma_vm(vma)->flags, deferred);
1532 
1533 				pt_dir->children[i] = NULL;
1534 			}
1535 		}
1536 	}
1537 }
1538 
1539 static const struct xe_migrate_pt_update_ops unbind_ops = {
1540 	.populate = xe_migrate_clear_pgtable_callback,
1541 	.pre_commit = xe_pt_pre_commit,
1542 };
1543 
1544 static const struct xe_migrate_pt_update_ops userptr_unbind_ops = {
1545 	.populate = xe_migrate_clear_pgtable_callback,
1546 	.pre_commit = xe_pt_userptr_pre_commit,
1547 };
1548 
1549 /**
1550  * __xe_pt_unbind_vma() - Disconnect and free a page-table tree for the vma
1551  * address range.
1552  * @tile: The tile to unbind for.
1553  * @vma: The vma to unbind.
1554  * @q: The exec_queue with which to do pipelined page-table updates.
1555  * @syncs: Entries to sync on before disconnecting the tree to be destroyed.
1556  * @num_syncs: Number of @sync entries.
1557  *
1558  * This function builds the xe_vm_pgtable_update entries abstracting the
1559  * operations needed to detach the page-table tree to be destroyed from the
1560  * main vm tree.
1561  * It then takes the relevant locks and submits the operations for
1562  * pipelined detachment of the gpu page-table from the vm main tree
1563  * (which can be done either by the CPU or the GPU). Finally it frees the
1564  * detached page-table tree.
1565  *
1566  * Return: A valid dma-fence representing the pipelined detachment operation
1567  * on success, an error pointer on error.
1568  */
1569 struct dma_fence *
1570 __xe_pt_unbind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queue *q,
1571 		   struct xe_sync_entry *syncs, u32 num_syncs)
1572 {
1573 	struct xe_vm_pgtable_update entries[XE_VM_MAX_LEVEL * 2 + 1];
1574 	struct xe_pt_migrate_pt_update unbind_pt_update = {
1575 		.base = {
1576 			.ops = xe_vma_is_userptr(vma) ? &userptr_unbind_ops :
1577 			&unbind_ops,
1578 			.vma = vma,
1579 			.tile_id = tile->id,
1580 		},
1581 	};
1582 	struct xe_vm *vm = xe_vma_vm(vma);
1583 	u32 num_entries;
1584 	struct dma_fence *fence = NULL;
1585 	struct invalidation_fence *ifence;
1586 	struct xe_range_fence *rfence;
1587 	int err;
1588 
1589 	LLIST_HEAD(deferred);
1590 
1591 	xe_bo_assert_held(xe_vma_bo(vma));
1592 	xe_vm_assert_held(vm);
1593 
1594 	vm_dbg(&xe_vma_vm(vma)->xe->drm,
1595 	       "Preparing unbind, with range [%llx...%llx) engine %p.\n",
1596 	       xe_vma_start(vma), xe_vma_end(vma), q);
1597 
1598 	num_entries = xe_pt_stage_unbind(tile, vma, entries);
1599 	xe_tile_assert(tile, num_entries <= ARRAY_SIZE(entries));
1600 
1601 	xe_vm_dbg_print_entries(tile_to_xe(tile), entries, num_entries);
1602 	xe_pt_calc_rfence_interval(vma, &unbind_pt_update, entries,
1603 				   num_entries);
1604 
1605 	err = dma_resv_reserve_fences(xe_vm_resv(vm), 1);
1606 	if (!err && !xe_vma_has_no_bo(vma) && !xe_vma_bo(vma)->vm)
1607 		err = dma_resv_reserve_fences(xe_vma_bo(vma)->ttm.base.resv, 1);
1608 	if (err)
1609 		return ERR_PTR(err);
1610 
1611 	ifence = kzalloc(sizeof(*ifence), GFP_KERNEL);
1612 	if (!ifence)
1613 		return ERR_PTR(-ENOMEM);
1614 
1615 	rfence = kzalloc(sizeof(*rfence), GFP_KERNEL);
1616 	if (!rfence) {
1617 		kfree(ifence);
1618 		return ERR_PTR(-ENOMEM);
1619 	}
1620 
1621 	/*
1622 	 * Even if we were already evicted and are unbinding to destroy, we need to
1623 	 * clear again here. The eviction may have updated pagetables at a
1624 	 * lower level, because it needs to be more conservative.
1625 	 */
1626 	fence = xe_migrate_update_pgtables(tile->migrate,
1627 					   vm, NULL, q ? q :
1628 					   vm->q[tile->id],
1629 					   entries, num_entries,
1630 					   syncs, num_syncs,
1631 					   &unbind_pt_update.base);
1632 	if (!IS_ERR(fence)) {
1633 		int err;
1634 
1635 		err = xe_range_fence_insert(&vm->rftree[tile->id], rfence,
1636 					    &xe_range_fence_kfree_ops,
1637 					    unbind_pt_update.base.start,
1638 					    unbind_pt_update.base.last, fence);
1639 		if (err)
1640 			dma_fence_wait(fence, false);
1641 
1642 		/* TLB invalidation must be done before signaling unbind */
1643 		err = invalidation_fence_init(tile->primary_gt, ifence, fence, vma);
1644 		if (err) {
1645 			dma_fence_put(fence);
1646 			kfree(ifence);
1647 			return ERR_PTR(err);
1648 		}
1649 		fence = &ifence->base.base;
1650 
1651 		/* add shared fence now for pagetable delayed destroy */
1652 		dma_resv_add_fence(xe_vm_resv(vm), fence,
1653 				   DMA_RESV_USAGE_BOOKKEEP);
1654 
1655 		/* This fence will be installed by caller when doing eviction */
1656 		if (!xe_vma_has_no_bo(vma) && !xe_vma_bo(vma)->vm)
1657 			dma_resv_add_fence(xe_vma_bo(vma)->ttm.base.resv, fence,
1658 					   DMA_RESV_USAGE_BOOKKEEP);
1659 		xe_pt_commit_unbind(vma, entries, num_entries,
1660 				    unbind_pt_update.locked ? &deferred : NULL);
1661 		vma->tile_present &= ~BIT(tile->id);
1662 	} else {
1663 		kfree(rfence);
1664 		kfree(ifence);
1665 	}
1666 
1667 	if (!vma->tile_present)
1668 		list_del_init(&vma->combined_links.rebind);
1669 
1670 	if (unbind_pt_update.locked) {
1671 		xe_tile_assert(tile, xe_vma_is_userptr(vma));
1672 
1673 		if (!vma->tile_present) {
1674 			spin_lock(&vm->userptr.invalidated_lock);
1675 			list_del_init(&to_userptr_vma(vma)->userptr.invalidate_link);
1676 			spin_unlock(&vm->userptr.invalidated_lock);
1677 		}
1678 		up_read(&vm->userptr.notifier_lock);
1679 		xe_bo_put_commit(&deferred);
1680 	}
1681 
1682 	return fence;
1683 }
1684