xref: /linux/drivers/gpu/drm/xe/xe_bo.c (revision c3fb1fb9e65fa6a108b4d19c61bdcb47fd4fe180)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5 
6 #include "xe_bo.h"
7 
8 #include <linux/dma-buf.h>
9 #include <linux/nospec.h>
10 
11 #include <drm/drm_drv.h>
12 #include <drm/drm_dumb_buffers.h>
13 #include <drm/drm_gem_ttm_helper.h>
14 #include <drm/drm_managed.h>
15 #include <drm/ttm/ttm_backup.h>
16 #include <drm/ttm/ttm_device.h>
17 #include <drm/ttm/ttm_placement.h>
18 #include <drm/ttm/ttm_tt.h>
19 #include <uapi/drm/xe_drm.h>
20 
21 #include <kunit/static_stub.h>
22 
23 #include <trace/events/gpu_mem.h>
24 
25 #include "xe_device.h"
26 #include "xe_dma_buf.h"
27 #include "xe_drm_client.h"
28 #include "xe_ggtt.h"
29 #include "xe_map.h"
30 #include "xe_migrate.h"
31 #include "xe_pat.h"
32 #include "xe_pm.h"
33 #include "xe_preempt_fence.h"
34 #include "xe_pxp.h"
35 #include "xe_res_cursor.h"
36 #include "xe_shrinker.h"
37 #include "xe_sriov_vf_ccs.h"
38 #include "xe_tile.h"
39 #include "xe_trace_bo.h"
40 #include "xe_ttm_stolen_mgr.h"
41 #include "xe_vm.h"
42 #include "xe_vram_types.h"
43 
/* Printable name of each memory type used by xe, indexed by the XE_PL_* value. */
const char *const xe_mem_type_to_name[TTM_NUM_MEM_TYPES]  = {
	[XE_PL_SYSTEM] = "system",
	[XE_PL_TT] = "gtt",
	[XE_PL_VRAM0] = "vram0",
	[XE_PL_VRAM1] = "vram1",
	[XE_PL_STOLEN] = "stolen"
};
51 
52 static const struct ttm_place sys_placement_flags = {
53 	.fpfn = 0,
54 	.lpfn = 0,
55 	.mem_type = XE_PL_SYSTEM,
56 	.flags = 0,
57 };
58 
/* Placement list containing only sys_placement_flags (XE_PL_SYSTEM). */
static struct ttm_placement sys_placement = {
	.num_placement = 1,
	.placement = &sys_placement_flags,
};
63 
/*
 * Placement with no places at all (static storage, so zero-initialized).
 * Handed out by xe_evict_flags() when the device has been unplugged.
 */
static struct ttm_placement purge_placement;
65 
66 static const struct ttm_place tt_placement_flags[] = {
67 	{
68 		.fpfn = 0,
69 		.lpfn = 0,
70 		.mem_type = XE_PL_TT,
71 		.flags = TTM_PL_FLAG_DESIRED,
72 	},
73 	{
74 		.fpfn = 0,
75 		.lpfn = 0,
76 		.mem_type = XE_PL_SYSTEM,
77 		.flags = TTM_PL_FLAG_FALLBACK,
78 	}
79 };
80 
81 static struct ttm_placement tt_placement = {
82 	.num_placement = 2,
83 	.placement = tt_placement_flags,
84 };
85 
/*
 * Iterate over the VRAM placement flags set in @bo_flags__.
 *
 * A single-bit candidate walks upwards from BIT(0) for as long as it does
 * not exceed XE_BO_FLAG_VRAM_MASK. Each candidate is assigned to @bit__, and
 * the loop body only runs for candidates that are set in both @bo_flags__
 * and XE_BO_FLAG_VRAM_MASK.
 */
#define for_each_set_bo_vram_flag(bit__, bo_flags__) \
	for (unsigned int __bit_tmp = BIT(0); __bit_tmp <= XE_BO_FLAG_VRAM_MASK; __bit_tmp <<= 1) \
		for_each_if(((bit__) = __bit_tmp) & (bo_flags__) & XE_BO_FLAG_VRAM_MASK)
89 
90 bool mem_type_is_vram(u32 mem_type)
91 {
92 	return mem_type >= XE_PL_VRAM0 && mem_type != XE_PL_STOLEN;
93 }
94 
95 static bool resource_is_stolen_vram(struct xe_device *xe, struct ttm_resource *res)
96 {
97 	return res->mem_type == XE_PL_STOLEN && IS_DGFX(xe);
98 }
99 
100 static bool resource_is_vram(struct ttm_resource *res)
101 {
102 	return mem_type_is_vram(res->mem_type);
103 }
104 
105 bool xe_bo_is_vram(struct xe_bo *bo)
106 {
107 	return resource_is_vram(bo->ttm.resource) ||
108 		resource_is_stolen_vram(xe_bo_device(bo), bo->ttm.resource);
109 }
110 
111 bool xe_bo_is_stolen(struct xe_bo *bo)
112 {
113 	return bo->ttm.resource->mem_type == XE_PL_STOLEN;
114 }
115 
116 /**
117  * xe_bo_has_single_placement - check if BO is placed only in one memory location
118  * @bo: The BO
119  *
120  * This function checks whether a given BO is placed in only one memory location.
121  *
122  * Returns: true if the BO is placed in a single memory location, false otherwise.
123  *
124  */
125 bool xe_bo_has_single_placement(struct xe_bo *bo)
126 {
127 	return bo->placement.num_placement == 1;
128 }
129 
130 /**
131  * xe_bo_is_stolen_devmem - check if BO is of stolen type accessed via PCI BAR
132  * @bo: The BO
133  *
134  * The stolen memory is accessed through the PCI BAR for both DGFX and some
135  * integrated platforms that have a dedicated bit in the PTE for devmem (DM).
136  *
137  * Returns: true if it's stolen memory accessed via PCI BAR, false otherwise.
138  */
139 bool xe_bo_is_stolen_devmem(struct xe_bo *bo)
140 {
141 	return xe_bo_is_stolen(bo) &&
142 		GRAPHICS_VERx100(xe_bo_device(bo)) >= 1270;
143 }
144 
145 /**
146  * xe_bo_is_vm_bound - check if BO has any mappings through VM_BIND
147  * @bo: The BO
148  *
149  * Check if a given bo is bound through VM_BIND. This requires the
150  * reservation lock for the BO to be held.
151  *
152  * Returns: boolean
153  */
154 bool xe_bo_is_vm_bound(struct xe_bo *bo)
155 {
156 	xe_bo_assert_held(bo);
157 
158 	return !list_empty(&bo->ttm.base.gpuva.list);
159 }
160 
161 static bool xe_bo_is_user(struct xe_bo *bo)
162 {
163 	return bo->flags & XE_BO_FLAG_USER;
164 }
165 
166 static struct xe_migrate *
167 mem_type_to_migrate(struct xe_device *xe, u32 mem_type)
168 {
169 	struct xe_tile *tile;
170 
171 	xe_assert(xe, mem_type == XE_PL_STOLEN || mem_type_is_vram(mem_type));
172 	tile = &xe->tiles[mem_type == XE_PL_STOLEN ? 0 : (mem_type - XE_PL_VRAM0)];
173 	return tile->migrate;
174 }
175 
176 static void try_add_system(struct xe_device *xe, struct xe_bo *bo,
177 			   u32 bo_flags, u32 *c)
178 {
179 	if (bo_flags & XE_BO_FLAG_SYSTEM) {
180 		xe_assert(xe, *c < ARRAY_SIZE(bo->placements));
181 
182 		bo->placements[*c] = (struct ttm_place) {
183 			.mem_type = XE_PL_TT,
184 			.flags = (bo_flags & XE_BO_FLAG_VRAM_MASK) ?
185 			TTM_PL_FLAG_FALLBACK : 0,
186 		};
187 		*c += 1;
188 	}
189 }
190 
191 static bool force_contiguous(u32 bo_flags)
192 {
193 	if (bo_flags & XE_BO_FLAG_STOLEN)
194 		return true; /* users expect this */
195 	else if (bo_flags & XE_BO_FLAG_PINNED &&
196 		 !(bo_flags & XE_BO_FLAG_PINNED_LATE_RESTORE))
197 		return true; /* needs vmap */
198 	else if (bo_flags & XE_BO_FLAG_CPU_ADDR_MIRROR)
199 		return true;
200 
201 	/*
202 	 * For eviction / restore on suspend / resume objects pinned in VRAM
203 	 * must be contiguous, also only contiguous BOs support xe_bo_vmap.
204 	 */
205 	return bo_flags & XE_BO_FLAG_NEEDS_CPU_ACCESS &&
206 	       bo_flags & XE_BO_FLAG_PINNED;
207 }
208 
209 static u8 vram_bo_flag_to_tile_id(struct xe_device *xe, u32 vram_bo_flag)
210 {
211 	xe_assert(xe, vram_bo_flag & XE_BO_FLAG_VRAM_MASK);
212 	xe_assert(xe, (vram_bo_flag & (vram_bo_flag - 1)) == 0);
213 
214 	return __ffs(vram_bo_flag >> (__ffs(XE_BO_FLAG_VRAM0) - 1)) - 1;
215 }
216 
217 static u32 bo_vram_flags_to_vram_placement(struct xe_device *xe, u32 bo_flags, u32 vram_flag,
218 					   enum ttm_bo_type type)
219 {
220 	u8 tile_id = vram_bo_flag_to_tile_id(xe, vram_flag);
221 
222 	xe_assert(xe, tile_id < xe->info.tile_count);
223 
224 	if (type == ttm_bo_type_kernel && !(bo_flags & XE_BO_FLAG_FORCE_USER_VRAM))
225 		return xe->tiles[tile_id].mem.kernel_vram->placement;
226 	else
227 		return xe->tiles[tile_id].mem.vram->placement;
228 }
229 
230 static void add_vram(struct xe_device *xe, struct xe_bo *bo,
231 		     struct ttm_place *places, u32 bo_flags, u32 mem_type, u32 *c)
232 {
233 	struct ttm_place place = { .mem_type = mem_type };
234 	struct ttm_resource_manager *mgr = ttm_manager_type(&xe->ttm, mem_type);
235 	struct xe_ttm_vram_mgr *vram_mgr = to_xe_ttm_vram_mgr(mgr);
236 
237 	struct xe_vram_region *vram;
238 	u64 io_size;
239 
240 	xe_assert(xe, *c < ARRAY_SIZE(bo->placements));
241 
242 	vram = container_of(vram_mgr, struct xe_vram_region, ttm);
243 	xe_assert(xe, vram && vram->usable_size);
244 	io_size = vram->io_size;
245 
246 	if (force_contiguous(bo_flags))
247 		place.flags |= TTM_PL_FLAG_CONTIGUOUS;
248 
249 	if (io_size < vram->usable_size) {
250 		if (bo_flags & XE_BO_FLAG_NEEDS_CPU_ACCESS) {
251 			place.fpfn = 0;
252 			place.lpfn = io_size >> PAGE_SHIFT;
253 		} else {
254 			place.flags |= TTM_PL_FLAG_TOPDOWN;
255 		}
256 	}
257 	places[*c] = place;
258 	*c += 1;
259 }
260 
261 static void try_add_vram(struct xe_device *xe, struct xe_bo *bo,
262 			 u32 bo_flags, enum ttm_bo_type type, u32 *c)
263 {
264 	u32 vram_flag;
265 
266 	for_each_set_bo_vram_flag(vram_flag, bo_flags) {
267 		u32 pl = bo_vram_flags_to_vram_placement(xe, bo_flags, vram_flag, type);
268 
269 		add_vram(xe, bo, bo->placements, bo_flags, pl, c);
270 	}
271 }
272 
273 static void try_add_stolen(struct xe_device *xe, struct xe_bo *bo,
274 			   u32 bo_flags, u32 *c)
275 {
276 	if (bo_flags & XE_BO_FLAG_STOLEN) {
277 		xe_assert(xe, *c < ARRAY_SIZE(bo->placements));
278 
279 		bo->placements[*c] = (struct ttm_place) {
280 			.mem_type = XE_PL_STOLEN,
281 			.flags = force_contiguous(bo_flags) ?
282 				TTM_PL_FLAG_CONTIGUOUS : 0,
283 		};
284 		*c += 1;
285 	}
286 }
287 
288 static int __xe_bo_placement_for_flags(struct xe_device *xe, struct xe_bo *bo,
289 				       u32 bo_flags, enum ttm_bo_type type)
290 {
291 	u32 c = 0;
292 
293 	try_add_vram(xe, bo, bo_flags, type, &c);
294 	try_add_system(xe, bo, bo_flags, &c);
295 	try_add_stolen(xe, bo, bo_flags, &c);
296 
297 	if (!c)
298 		return -EINVAL;
299 
300 	bo->placement = (struct ttm_placement) {
301 		.num_placement = c,
302 		.placement = bo->placements,
303 	};
304 
305 	return 0;
306 }
307 
308 int xe_bo_placement_for_flags(struct xe_device *xe, struct xe_bo *bo,
309 			      u32 bo_flags, enum ttm_bo_type type)
310 {
311 	xe_bo_assert_held(bo);
312 	return __xe_bo_placement_for_flags(xe, bo, bo_flags, type);
313 }
314 
315 static void xe_evict_flags(struct ttm_buffer_object *tbo,
316 			   struct ttm_placement *placement)
317 {
318 	struct xe_device *xe = container_of(tbo->bdev, typeof(*xe), ttm);
319 	bool device_unplugged = drm_dev_is_unplugged(&xe->drm);
320 	struct xe_bo *bo;
321 
322 	if (!xe_bo_is_xe_bo(tbo)) {
323 		/* Don't handle scatter gather BOs */
324 		if (tbo->type == ttm_bo_type_sg) {
325 			placement->num_placement = 0;
326 			return;
327 		}
328 
329 		*placement = device_unplugged ? purge_placement : sys_placement;
330 		return;
331 	}
332 
333 	bo = ttm_to_xe_bo(tbo);
334 	if (bo->flags & XE_BO_FLAG_CPU_ADDR_MIRROR) {
335 		*placement = sys_placement;
336 		return;
337 	}
338 
339 	if (device_unplugged && !tbo->base.dma_buf) {
340 		*placement = purge_placement;
341 		return;
342 	}
343 
344 	/*
345 	 * For xe, sg bos that are evicted to system just triggers a
346 	 * rebind of the sg list upon subsequent validation to XE_PL_TT.
347 	 */
348 	switch (tbo->resource->mem_type) {
349 	case XE_PL_VRAM0:
350 	case XE_PL_VRAM1:
351 	case XE_PL_STOLEN:
352 		*placement = tt_placement;
353 		break;
354 	case XE_PL_TT:
355 	default:
356 		*placement = sys_placement;
357 		break;
358 	}
359 }
360 
/** struct xe_ttm_tt - Subclassed ttm_tt for xe */
struct xe_ttm_tt {
	/** @ttm: The base TTM tt object; container_of() on it yields the xe_ttm_tt. */
	struct ttm_tt ttm;
	/** @sgt: Table storage that xe_tt_map_sg() fills from the pages of @ttm. */
	struct sg_table sgt;
	/**
	 * @sg: The sg table currently in use, or NULL if there is none.
	 * Points to @sgt after xe_tt_map_sg(); set to the table returned by
	 * dma_buf_map_attachment() in xe_bo_move_dmabuf().
	 */
	struct sg_table *sg;
	/** @purgeable: Whether the content of the pages of @ttm is purgeable. */
	bool purgeable;
};
369 
370 static int xe_tt_map_sg(struct xe_device *xe, struct ttm_tt *tt)
371 {
372 	struct xe_ttm_tt *xe_tt = container_of(tt, struct xe_ttm_tt, ttm);
373 	unsigned long num_pages = tt->num_pages;
374 	int ret;
375 
376 	XE_WARN_ON((tt->page_flags & TTM_TT_FLAG_EXTERNAL) &&
377 		   !(tt->page_flags & TTM_TT_FLAG_EXTERNAL_MAPPABLE));
378 
379 	if (xe_tt->sg)
380 		return 0;
381 
382 	ret = sg_alloc_table_from_pages_segment(&xe_tt->sgt, tt->pages,
383 						num_pages, 0,
384 						(u64)num_pages << PAGE_SHIFT,
385 						xe_sg_segment_size(xe->drm.dev),
386 						GFP_KERNEL);
387 	if (ret)
388 		return ret;
389 
390 	xe_tt->sg = &xe_tt->sgt;
391 	ret = dma_map_sgtable(xe->drm.dev, xe_tt->sg, DMA_BIDIRECTIONAL,
392 			      DMA_ATTR_SKIP_CPU_SYNC);
393 	if (ret) {
394 		sg_free_table(xe_tt->sg);
395 		xe_tt->sg = NULL;
396 		return ret;
397 	}
398 
399 	return 0;
400 }
401 
402 static void xe_tt_unmap_sg(struct xe_device *xe, struct ttm_tt *tt)
403 {
404 	struct xe_ttm_tt *xe_tt = container_of(tt, struct xe_ttm_tt, ttm);
405 
406 	if (xe_tt->sg) {
407 		dma_unmap_sgtable(xe->drm.dev, xe_tt->sg,
408 				  DMA_BIDIRECTIONAL, 0);
409 		sg_free_table(xe_tt->sg);
410 		xe_tt->sg = NULL;
411 	}
412 }
413 
414 struct sg_table *xe_bo_sg(struct xe_bo *bo)
415 {
416 	struct ttm_tt *tt = bo->ttm.ttm;
417 	struct xe_ttm_tt *xe_tt = container_of(tt, struct xe_ttm_tt, ttm);
418 
419 	return xe_tt->sg;
420 }
421 
422 /*
423  * Account ttm pages against the device shrinker's shrinkable and
424  * purgeable counts.
425  */
426 static void xe_ttm_tt_account_add(struct xe_device *xe, struct ttm_tt *tt)
427 {
428 	struct xe_ttm_tt *xe_tt = container_of(tt, struct xe_ttm_tt, ttm);
429 
430 	if (xe_tt->purgeable)
431 		xe_shrinker_mod_pages(xe->mem.shrinker, 0, tt->num_pages);
432 	else
433 		xe_shrinker_mod_pages(xe->mem.shrinker, tt->num_pages, 0);
434 }
435 
436 static void xe_ttm_tt_account_subtract(struct xe_device *xe, struct ttm_tt *tt)
437 {
438 	struct xe_ttm_tt *xe_tt = container_of(tt, struct xe_ttm_tt, ttm);
439 
440 	if (xe_tt->purgeable)
441 		xe_shrinker_mod_pages(xe->mem.shrinker, 0, -(long)tt->num_pages);
442 	else
443 		xe_shrinker_mod_pages(xe->mem.shrinker, -(long)tt->num_pages, 0);
444 }
445 
446 static void update_global_total_pages(struct ttm_device *ttm_dev,
447 				      long num_pages)
448 {
449 #if IS_ENABLED(CONFIG_TRACE_GPU_MEM)
450 	struct xe_device *xe = ttm_to_xe_device(ttm_dev);
451 	u64 global_total_pages =
452 		atomic64_add_return(num_pages, &xe->global_total_pages);
453 
454 	trace_gpu_mem_total(xe->drm.primary->index, 0,
455 			    global_total_pages << PAGE_SHIFT);
456 #endif
457 }
458 
459 static struct ttm_tt *xe_ttm_tt_create(struct ttm_buffer_object *ttm_bo,
460 				       u32 page_flags)
461 {
462 	struct xe_bo *bo = ttm_to_xe_bo(ttm_bo);
463 	struct xe_device *xe = xe_bo_device(bo);
464 	struct xe_ttm_tt *xe_tt;
465 	struct ttm_tt *tt;
466 	unsigned long extra_pages;
467 	enum ttm_caching caching = ttm_cached;
468 	int err;
469 
470 	xe_tt = kzalloc_obj(*xe_tt);
471 	if (!xe_tt)
472 		return NULL;
473 
474 	tt = &xe_tt->ttm;
475 
476 	extra_pages = 0;
477 	if (xe_bo_needs_ccs_pages(bo))
478 		extra_pages = DIV_ROUND_UP(xe_device_ccs_bytes(xe, xe_bo_size(bo)),
479 					   PAGE_SIZE);
480 
481 	/*
482 	 * DGFX system memory is always WB / ttm_cached, since
483 	 * other caching modes are only supported on x86. DGFX
484 	 * GPU system memory accesses are always coherent with the
485 	 * CPU.
486 	 */
487 	if (!IS_DGFX(xe)) {
488 		switch (bo->cpu_caching) {
489 		case DRM_XE_GEM_CPU_CACHING_WC:
490 			caching = ttm_write_combined;
491 			break;
492 		default:
493 			caching = ttm_cached;
494 			break;
495 		}
496 
497 		WARN_ON((bo->flags & XE_BO_FLAG_USER) && !bo->cpu_caching);
498 
499 		/*
500 		 * For Xe_LPG and beyond up to NVL-P (excluding), PPGTT PTE
501 		 * lookups are also non-coherent and require a CPU:WC mapping.
502 		 */
503 		if ((!bo->cpu_caching && bo->flags & XE_BO_FLAG_FORCE_WC) ||
504 		    (!xe->info.has_cached_pt && bo->flags & XE_BO_FLAG_PAGETABLE))
505 			caching = ttm_write_combined;
506 	}
507 
508 	if (bo->flags & XE_BO_FLAG_NEEDS_UC) {
509 		/*
510 		 * Valid only for internally-created buffers only, for
511 		 * which cpu_caching is never initialized.
512 		 */
513 		xe_assert(xe, bo->cpu_caching == 0);
514 		caching = ttm_uncached;
515 	}
516 
517 	if (ttm_bo->type != ttm_bo_type_sg)
518 		page_flags |= TTM_TT_FLAG_EXTERNAL | TTM_TT_FLAG_EXTERNAL_MAPPABLE;
519 
520 	err = ttm_tt_init(tt, &bo->ttm, page_flags, caching, extra_pages);
521 	if (err) {
522 		kfree(xe_tt);
523 		return NULL;
524 	}
525 
526 	if (ttm_bo->type != ttm_bo_type_sg) {
527 		err = ttm_tt_setup_backup(tt);
528 		if (err) {
529 			ttm_tt_fini(tt);
530 			kfree(xe_tt);
531 			return NULL;
532 		}
533 	}
534 
535 	return tt;
536 }
537 
538 static int xe_ttm_tt_populate(struct ttm_device *ttm_dev, struct ttm_tt *tt,
539 			      struct ttm_operation_ctx *ctx)
540 {
541 	struct xe_ttm_tt *xe_tt = container_of(tt, struct xe_ttm_tt, ttm);
542 	int err;
543 
544 	/*
545 	 * dma-bufs are not populated with pages, and the dma-
546 	 * addresses are set up when moved to XE_PL_TT.
547 	 */
548 	if ((tt->page_flags & TTM_TT_FLAG_EXTERNAL) &&
549 	    !(tt->page_flags & TTM_TT_FLAG_EXTERNAL_MAPPABLE))
550 		return 0;
551 
552 	if (ttm_tt_is_backed_up(tt) && !xe_tt->purgeable) {
553 		err = ttm_tt_restore(ttm_dev, tt, ctx);
554 	} else {
555 		ttm_tt_clear_backed_up(tt);
556 		err = ttm_pool_alloc(&ttm_dev->pool, tt, ctx);
557 	}
558 	if (err)
559 		return err;
560 
561 	xe_tt->purgeable = false;
562 	xe_ttm_tt_account_add(ttm_to_xe_device(ttm_dev), tt);
563 	update_global_total_pages(ttm_dev, tt->num_pages);
564 
565 	return 0;
566 }
567 
568 static void xe_ttm_tt_unpopulate(struct ttm_device *ttm_dev, struct ttm_tt *tt)
569 {
570 	struct xe_device *xe = ttm_to_xe_device(ttm_dev);
571 
572 	if ((tt->page_flags & TTM_TT_FLAG_EXTERNAL) &&
573 	    !(tt->page_flags & TTM_TT_FLAG_EXTERNAL_MAPPABLE))
574 		return;
575 
576 	xe_tt_unmap_sg(xe, tt);
577 
578 	ttm_pool_free(&ttm_dev->pool, tt);
579 	xe_ttm_tt_account_subtract(xe, tt);
580 	update_global_total_pages(ttm_dev, -(long)tt->num_pages);
581 }
582 
583 static void xe_ttm_tt_destroy(struct ttm_device *ttm_dev, struct ttm_tt *tt)
584 {
585 	ttm_tt_fini(tt);
586 	kfree(tt);
587 }
588 
589 static bool xe_ttm_resource_visible(struct ttm_resource *mem)
590 {
591 	struct xe_ttm_vram_mgr_resource *vres =
592 		to_xe_ttm_vram_mgr_resource(mem);
593 
594 	return vres->used_visible_size == mem->size;
595 }
596 
597 /**
598  * xe_bo_is_visible_vram - check if BO is placed entirely in visible VRAM.
599  * @bo: The BO
600  *
601  * This function checks whether a given BO resides entirely in memory visible from the CPU
602  *
603  * Returns: true if the BO is entirely visible, false otherwise.
604  *
605  */
606 bool xe_bo_is_visible_vram(struct xe_bo *bo)
607 {
608 	if (drm_WARN_ON(bo->ttm.base.dev, !xe_bo_is_vram(bo)))
609 		return false;
610 
611 	return xe_ttm_resource_visible(bo->ttm.resource);
612 }
613 
/*
 * Fill in the bus (CPU access) information of @mem.
 *
 * System and TT resources need no setup. VRAM resources must be entirely
 * CPU-visible and get their bus offset, and optionally a CPU address, set
 * up from the owning VRAM region. Stolen resources are delegated to
 * xe_ttm_stolen_io_mem_reserve().
 *
 * Return: 0 on success, -EINVAL for a VRAM resource that is not entirely
 * visible or for an unknown memory type, otherwise the result of
 * xe_ttm_stolen_io_mem_reserve().
 */
static int xe_ttm_io_mem_reserve(struct ttm_device *bdev,
				 struct ttm_resource *mem)
{
	struct xe_device *xe = ttm_to_xe_device(bdev);

	switch (mem->mem_type) {
	case XE_PL_SYSTEM:
	case XE_PL_TT:
		return 0;
	case XE_PL_VRAM0:
	case XE_PL_VRAM1: {
		struct xe_vram_region *vram = xe_map_resource_to_region(mem);

		if (!xe_ttm_resource_visible(mem))
			return -EINVAL;

		/* At this point the offset is relative to the start of the region. */
		mem->bus.offset = mem->start << PAGE_SHIFT;

		/*
		 * A CPU address is only provided when the region has a
		 * mapping and the resource was placed contiguously.
		 */
		if (vram->mapping &&
		    mem->placement & TTM_PL_FLAG_CONTIGUOUS)
			mem->bus.addr = (u8 __force *)vram->mapping +
				mem->bus.offset;

		/* Add the region's io_start to the region-relative offset. */
		mem->bus.offset += vram->io_start;
		mem->bus.is_iomem = true;

#if  !IS_ENABLED(CONFIG_X86)
		mem->bus.caching = ttm_write_combined;
#endif
		return 0;
	} case XE_PL_STOLEN:
		return xe_ttm_stolen_io_mem_reserve(xe, mem);
	default:
		return -EINVAL;
	}
}
650 
/*
 * Prepare the GPU mappings of @bo for a change of its backing store.
 *
 * The BO's reservation lock must be held. For every VM the BO is bound in:
 * non-fault-mode VMs get the vm_bo marked as evicted; for fault-mode VMs
 * (and for all VMs when xe_device_is_l2_flush_optimized() is true) the
 * function additionally waits once for the BO's BOOKKEEP fences and then
 * invalidates every VMA of the BO.
 *
 * Return: 0 on success, -EBUSY if @ctx forbids waiting for the GPU and the
 * BO is not idle, -ETIME if the wait returned 0, a negative error code from
 * the wait, or the error returned by xe_vm_invalidate_vma().
 */
static int xe_bo_trigger_rebind(struct xe_device *xe, struct xe_bo *bo,
				const struct ttm_operation_ctx *ctx)
{
	struct dma_resv_iter cursor;
	struct dma_fence *fence;
	struct drm_gem_object *obj = &bo->ttm.base;
	struct drm_gpuvm_bo *vm_bo;
	bool idle = false;
	int ret = 0;

	dma_resv_assert_held(bo->ttm.base.resv);

	/* If the BO is bound anywhere, enable signaling on all its BOOKKEEP fences. */
	if (!list_empty(&bo->ttm.base.gpuva.list)) {
		dma_resv_iter_begin(&cursor, bo->ttm.base.resv,
				    DMA_RESV_USAGE_BOOKKEEP);
		dma_resv_for_each_fence_unlocked(&cursor, fence)
			dma_fence_enable_sw_signaling(fence);
		dma_resv_iter_end(&cursor);
	}

	drm_gem_for_each_gpuvm_bo(vm_bo, obj) {
		struct xe_vm *vm = gpuvm_to_vm(vm_bo->vm);
		struct drm_gpuva *gpuva;

		if (!xe_vm_in_fault_mode(vm)) {
			drm_gpuvm_bo_evict(vm_bo, true);
			/*
			 * L2 cache may not be flushed, so ensure that is done in
			 * xe_vm_invalidate_vma() below
			 */
			if (!xe_device_is_l2_flush_optimized(xe))
				continue;
		}

		/* Wait for the BO to become idle, at most once per call. */
		if (!idle) {
			long timeout;

			if (ctx->no_wait_gpu &&
			    !dma_resv_test_signaled(bo->ttm.base.resv,
						    DMA_RESV_USAGE_BOOKKEEP))
				return -EBUSY;

			timeout = dma_resv_wait_timeout(bo->ttm.base.resv,
							DMA_RESV_USAGE_BOOKKEEP,
							ctx->interruptible,
							MAX_SCHEDULE_TIMEOUT);
			if (!timeout)
				return -ETIME;
			if (timeout < 0)
				return timeout;

			idle = true;
		}

		/* Invalidate each mapping of the BO in this VM. */
		drm_gpuvm_bo_for_each_va(gpuva, vm_bo) {
			struct xe_vma *vma = gpuva_to_vma(gpuva);

			trace_xe_vma_evict(vma);
			ret = xe_vm_invalidate_vma(vma);
			if (XE_WARN_ON(ret))
				return ret;
		}
	}

	return ret;
}
717 
/*
 * The dma-buf map_attachment() / unmap_attachment() is hooked up here.
 * Note that unmapping the attachment is deferred to the next
 * map_attachment time, or to bo destroy (after idling) whichever comes first.
 * This is to avoid syncing before unmap_attachment(), assuming that the
 * caller relies on idling the reservation object before moving the
 * backing store out. Should that assumption not hold, then we will be able
 * to unconditionally call unmap_attachment() when moving out to system.
 *
 * @ttm_bo: The imported (sg type) buffer object being moved.
 * @new_res: The resource the buffer object is moved to.
 *
 * Return: 0 on success, or the error from dma_buf_map_attachment().
 */
static int xe_bo_move_dmabuf(struct ttm_buffer_object *ttm_bo,
			     struct ttm_resource *new_res)
{
	struct dma_buf_attachment *attach = ttm_bo->base.import_attach;
	struct xe_ttm_tt *xe_tt = container_of(ttm_bo->ttm, struct xe_ttm_tt,
					       ttm);
	struct xe_device *xe = ttm_to_xe_device(ttm_bo->bdev);
	bool device_unplugged = drm_dev_is_unplugged(&xe->drm);
	struct sg_table *sg;

	xe_assert(xe, attach);
	xe_assert(xe, ttm_bo->ttm);

	/*
	 * On an unplugged device the unmap is not deferred: wait for the
	 * BO to idle and unmap the attachment right away when moving to
	 * system.
	 */
	if (device_unplugged && new_res->mem_type == XE_PL_SYSTEM &&
	    ttm_bo->sg) {
		dma_resv_wait_timeout(ttm_bo->base.resv, DMA_RESV_USAGE_BOOKKEEP,
				      false, MAX_SCHEDULE_TIMEOUT);
		dma_buf_unmap_attachment(attach, ttm_bo->sg, DMA_BIDIRECTIONAL);
		ttm_bo->sg = NULL;
	}

	/* Moving to system needs no new mapping. */
	if (new_res->mem_type == XE_PL_SYSTEM)
		goto out;

	/* Drop a mapping left over from an earlier move before remapping. */
	if (ttm_bo->sg) {
		dma_buf_unmap_attachment(attach, ttm_bo->sg, DMA_BIDIRECTIONAL);
		ttm_bo->sg = NULL;
	}

	sg = dma_buf_map_attachment(attach, DMA_BIDIRECTIONAL);
	if (IS_ERR(sg))
		return PTR_ERR(sg);

	/* Both the TTM BO and the xe tt reference the new mapping. */
	ttm_bo->sg = sg;
	xe_tt->sg = sg;

out:
	ttm_bo_move_null(ttm_bo, new_res);

	return 0;
}
768 
769 /**
770  * xe_bo_move_notify - Notify subsystems of a pending move
771  * @bo: The buffer object
772  * @ctx: The struct ttm_operation_ctx controlling locking and waits.
773  *
774  * This function notifies subsystems of an upcoming buffer move.
775  * Upon receiving such a notification, subsystems should schedule
776  * halting access to the underlying pages and optionally add a fence
777  * to the buffer object's dma_resv object, that signals when access is
778  * stopped. The caller will wait on all dma_resv fences before
779  * starting the move.
780  *
781  * A subsystem may commence access to the object after obtaining
782  * bindings to the new backing memory under the object lock.
783  *
784  * Return: 0 on success, -EINTR or -ERESTARTSYS if interrupted in fault mode,
785  * negative error code on error.
786  */
787 static int xe_bo_move_notify(struct xe_bo *bo,
788 			     const struct ttm_operation_ctx *ctx)
789 {
790 	struct ttm_buffer_object *ttm_bo = &bo->ttm;
791 	struct xe_device *xe = ttm_to_xe_device(ttm_bo->bdev);
792 	struct ttm_resource *old_mem = ttm_bo->resource;
793 	u32 old_mem_type = old_mem ? old_mem->mem_type : XE_PL_SYSTEM;
794 	int ret;
795 
796 	/*
797 	 * If this starts to call into many components, consider
798 	 * using a notification chain here.
799 	 */
800 
801 	if (xe_bo_is_pinned(bo))
802 		return -EINVAL;
803 
804 	xe_bo_vunmap(bo);
805 	ret = xe_bo_trigger_rebind(xe, bo, ctx);
806 	if (ret)
807 		return ret;
808 
809 	/* Don't call move_notify() for imported dma-bufs. */
810 	if (ttm_bo->base.dma_buf && !ttm_bo->base.import_attach)
811 		dma_buf_invalidate_mappings(ttm_bo->base.dma_buf);
812 
813 	/*
814 	 * TTM has already nuked the mmap for us (see ttm_bo_unmap_virtual),
815 	 * so if we moved from VRAM make sure to unlink this from the userfault
816 	 * tracking.
817 	 */
818 	if (mem_type_is_vram(old_mem_type)) {
819 		mutex_lock(&xe->mem_access.vram_userfault.lock);
820 		if (!list_empty(&bo->vram_userfault_link))
821 			list_del_init(&bo->vram_userfault_link);
822 		mutex_unlock(&xe->mem_access.vram_userfault.lock);
823 	}
824 
825 	return 0;
826 }
827 
828 /**
829  * xe_bo_set_purgeable_shrinker() - Update shrinker accounting for purgeable state
830  * @bo: Buffer object
831  * @new_state: New purgeable state being set
832  *
833  * Transfers pages between shrinkable and purgeable buckets when the BO
834  * purgeable state changes. Called automatically from xe_bo_set_purgeable_state().
835  */
836 static void xe_bo_set_purgeable_shrinker(struct xe_bo *bo,
837 					 enum xe_madv_purgeable_state new_state)
838 {
839 	struct ttm_buffer_object *ttm_bo = &bo->ttm;
840 	struct ttm_tt *tt = ttm_bo->ttm;
841 	struct xe_device *xe = ttm_to_xe_device(ttm_bo->bdev);
842 	struct xe_ttm_tt *xe_tt;
843 	long tt_pages;
844 
845 	xe_bo_assert_held(bo);
846 
847 	if (!tt || !ttm_tt_is_populated(tt))
848 		return;
849 
850 	xe_tt = container_of(tt, struct xe_ttm_tt, ttm);
851 	tt_pages = tt->num_pages;
852 
853 	if (!xe_tt->purgeable && new_state == XE_MADV_PURGEABLE_DONTNEED) {
854 		xe_tt->purgeable = true;
855 		/* Transfer pages from shrinkable to purgeable count */
856 		xe_shrinker_mod_pages(xe->mem.shrinker, -tt_pages, tt_pages);
857 	} else if (xe_tt->purgeable && new_state == XE_MADV_PURGEABLE_WILLNEED) {
858 		xe_tt->purgeable = false;
859 		/* Transfer pages from purgeable to shrinkable count */
860 		xe_shrinker_mod_pages(xe->mem.shrinker, tt_pages, -tt_pages);
861 	}
862 }
863 
/**
 * xe_bo_set_purgeable_state() - Set BO purgeable state with validation
 * @bo: Buffer object
 * @new_state: New purgeable state
 *
 * Sets the purgeable state with lockdep assertions and validates state
 * transitions. Once a BO is PURGED, it cannot transition to any other state.
 * Invalid transitions are caught with xe_assert(). Shrinker page accounting
 * is updated automatically through xe_bo_set_purgeable_shrinker().
 *
 * The BO's lock must be held by the caller.
 */
void xe_bo_set_purgeable_state(struct xe_bo *bo,
			       enum xe_madv_purgeable_state new_state)
{
	struct xe_device *xe = xe_bo_device(bo);

	xe_bo_assert_held(bo);

	/* Validate state is one of the known values */
	xe_assert(xe, new_state == XE_MADV_PURGEABLE_WILLNEED ||
		  new_state == XE_MADV_PURGEABLE_DONTNEED ||
		  new_state == XE_MADV_PURGEABLE_PURGED);

	/* Once purged, always purged - cannot transition out */
	xe_assert(xe, !(bo->madv_purgeable == XE_MADV_PURGEABLE_PURGED &&
			new_state != XE_MADV_PURGEABLE_PURGED));

	/* Record the new state first, then move the pages between shrinker counts. */
	bo->madv_purgeable = new_state;
	xe_bo_set_purgeable_shrinker(bo, new_state);
}
893 
894 /**
895  * xe_ttm_bo_purge() - Purge buffer object backing store
896  * @ttm_bo: The TTM buffer object to purge
897  * @ctx: TTM operation context
898  *
899  * This function purges the backing store of a BO marked as DONTNEED and
900  * triggers rebind to invalidate stale GPU mappings. For fault-mode VMs,
901  * this zaps the PTEs. The next GPU access will trigger a page fault and
902  * perform NULL rebind (scratch pages or clear PTEs based on VM config).
903  *
904  * Return: 0 on success, negative error code on failure
905  */
906 static int xe_ttm_bo_purge(struct ttm_buffer_object *ttm_bo, struct ttm_operation_ctx *ctx)
907 {
908 	struct xe_bo *bo = ttm_to_xe_bo(ttm_bo);
909 	struct ttm_placement place = {};
910 	int ret;
911 
912 	xe_bo_assert_held(bo);
913 
914 	if (!ttm_bo->ttm)
915 		return 0;
916 
917 	if (!xe_bo_madv_is_dontneed(bo))
918 		return 0;
919 
920 	/*
921 	 * Use the standard pre-move hook so we share the same cleanup/invalidate
922 	 * path as migrations: drop any CPU vmap and schedule the necessary GPU
923 	 * unbind/rebind work.
924 	 *
925 	 * This must be called before ttm_bo_validate() frees the pages.
926 	 * May fail in no-wait contexts (fault/shrinker) or if the BO is
927 	 * pinned. Keep state unchanged on failure so we don't end up "PURGED"
928 	 * with stale mappings.
929 	 */
930 	ret = xe_bo_move_notify(bo, ctx);
931 	if (ret)
932 		return ret;
933 
934 	ret = ttm_bo_validate(ttm_bo, &place, ctx);
935 	if (ret)
936 		return ret;
937 
938 	/* Commit the state transition only once invalidation was queued */
939 	xe_bo_set_purgeable_state(bo, XE_MADV_PURGEABLE_PURGED);
940 
941 	return 0;
942 }
943 
944 static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
945 		      struct ttm_operation_ctx *ctx,
946 		      struct ttm_resource *new_mem,
947 		      struct ttm_place *hop)
948 {
949 	struct xe_device *xe = ttm_to_xe_device(ttm_bo->bdev);
950 	struct xe_bo *bo = ttm_to_xe_bo(ttm_bo);
951 	struct ttm_resource *old_mem = ttm_bo->resource;
952 	u32 old_mem_type = old_mem ? old_mem->mem_type : XE_PL_SYSTEM;
953 	struct ttm_tt *ttm = ttm_bo->ttm;
954 	struct xe_migrate *migrate = NULL;
955 	struct dma_fence *fence;
956 	bool move_lacks_source;
957 	bool tt_has_data;
958 	bool needs_clear;
959 	bool handle_system_ccs = (!IS_DGFX(xe) && xe_bo_needs_ccs_pages(bo) &&
960 				  ttm && ttm_tt_is_populated(ttm)) ? true : false;
961 	int ret = 0;
962 
963 	/*
964 	 * Purge only non-shared BOs explicitly marked DONTNEED by userspace.
965 	 * The move_notify callback will handle invalidation asynchronously.
966 	 */
967 	if (evict && xe_bo_madv_is_dontneed(bo)) {
968 		ret = xe_ttm_bo_purge(ttm_bo, ctx);
969 		if (ret)
970 			return ret;
971 
972 		/* Free the unused eviction destination resource */
973 		ttm_resource_free(ttm_bo, &new_mem);
974 		return 0;
975 	}
976 
977 	/* Bo creation path, moving to system or TT. */
978 	if ((!old_mem && ttm) && !handle_system_ccs) {
979 		if (new_mem->mem_type == XE_PL_TT)
980 			ret = xe_tt_map_sg(xe, ttm);
981 		if (!ret)
982 			ttm_bo_move_null(ttm_bo, new_mem);
983 		goto out;
984 	}
985 
986 	if (ttm_bo->type == ttm_bo_type_sg) {
987 		if (new_mem->mem_type == XE_PL_SYSTEM)
988 			ret = xe_bo_move_notify(bo, ctx);
989 		if (!ret)
990 			ret = xe_bo_move_dmabuf(ttm_bo, new_mem);
991 		return ret;
992 	}
993 
994 	tt_has_data = ttm && (ttm_tt_is_populated(ttm) || ttm_tt_is_swapped(ttm));
995 
996 	move_lacks_source = !old_mem || (handle_system_ccs ? (!bo->ccs_cleared) :
997 					 (!mem_type_is_vram(old_mem_type) && !tt_has_data));
998 
999 	needs_clear = (ttm && ttm->page_flags & TTM_TT_FLAG_ZERO_ALLOC) ||
1000 		(!ttm && ttm_bo->type == ttm_bo_type_device);
1001 
1002 	if (new_mem->mem_type == XE_PL_TT) {
1003 		ret = xe_tt_map_sg(xe, ttm);
1004 		if (ret)
1005 			goto out;
1006 	}
1007 
1008 	if ((move_lacks_source && !needs_clear)) {
1009 		ttm_bo_move_null(ttm_bo, new_mem);
1010 		goto out;
1011 	}
1012 
1013 	if (!move_lacks_source && (bo->flags & XE_BO_FLAG_CPU_ADDR_MIRROR) &&
1014 	    new_mem->mem_type == XE_PL_SYSTEM) {
1015 		ret = xe_svm_bo_evict(bo);
1016 		if (!ret) {
1017 			drm_dbg(&xe->drm, "Evict system allocator BO success\n");
1018 			ttm_bo_move_null(ttm_bo, new_mem);
1019 		} else {
1020 			drm_dbg(&xe->drm, "Evict system allocator BO failed=%pe\n",
1021 				ERR_PTR(ret));
1022 		}
1023 
1024 		goto out;
1025 	}
1026 
1027 	if (old_mem_type == XE_PL_SYSTEM && new_mem->mem_type == XE_PL_TT && !handle_system_ccs) {
1028 		ttm_bo_move_null(ttm_bo, new_mem);
1029 		goto out;
1030 	}
1031 
1032 	/*
1033 	 * Failed multi-hop where the old_mem is still marked as
1034 	 * TTM_PL_FLAG_TEMPORARY, should just be a dummy move.
1035 	 */
1036 	if (old_mem_type == XE_PL_TT &&
1037 	    new_mem->mem_type == XE_PL_TT) {
1038 		ttm_bo_move_null(ttm_bo, new_mem);
1039 		goto out;
1040 	}
1041 
1042 	if (!move_lacks_source && !xe_bo_is_pinned(bo)) {
1043 		ret = xe_bo_move_notify(bo, ctx);
1044 		if (ret)
1045 			goto out;
1046 	}
1047 
1048 	if (old_mem_type == XE_PL_TT &&
1049 	    new_mem->mem_type == XE_PL_SYSTEM) {
1050 		long timeout = dma_resv_wait_timeout(ttm_bo->base.resv,
1051 						     DMA_RESV_USAGE_BOOKKEEP,
1052 						     false,
1053 						     MAX_SCHEDULE_TIMEOUT);
1054 		if (timeout < 0) {
1055 			ret = timeout;
1056 			goto out;
1057 		}
1058 
1059 		if (!handle_system_ccs) {
1060 			ttm_bo_move_null(ttm_bo, new_mem);
1061 			goto out;
1062 		}
1063 	}
1064 
1065 	if (!move_lacks_source &&
1066 	    ((old_mem_type == XE_PL_SYSTEM && resource_is_vram(new_mem)) ||
1067 	     (mem_type_is_vram(old_mem_type) &&
1068 	      new_mem->mem_type == XE_PL_SYSTEM))) {
1069 		hop->fpfn = 0;
1070 		hop->lpfn = 0;
1071 		hop->mem_type = XE_PL_TT;
1072 		hop->flags = TTM_PL_FLAG_TEMPORARY;
1073 		ret = -EMULTIHOP;
1074 		goto out;
1075 	}
1076 
1077 	if (bo->tile)
1078 		migrate = bo->tile->migrate;
1079 	else if (resource_is_vram(new_mem))
1080 		migrate = mem_type_to_migrate(xe, new_mem->mem_type);
1081 	else if (mem_type_is_vram(old_mem_type))
1082 		migrate = mem_type_to_migrate(xe, old_mem_type);
1083 	else
1084 		migrate = xe->tiles[0].migrate;
1085 
1086 	xe_assert(xe, migrate);
1087 	trace_xe_bo_move(bo, new_mem->mem_type, old_mem_type, move_lacks_source);
1088 	if (xe_rpm_reclaim_safe(xe)) {
1089 		/*
1090 		 * We might be called through swapout in the validation path of
1091 		 * another TTM device, so acquire rpm here.
1092 		 */
1093 		xe_pm_runtime_get(xe);
1094 	} else {
1095 		drm_WARN_ON(&xe->drm, handle_system_ccs);
1096 		xe_pm_runtime_get_noresume(xe);
1097 	}
1098 
1099 	if (move_lacks_source) {
1100 		u32 flags = 0;
1101 
1102 		if (mem_type_is_vram(new_mem->mem_type))
1103 			flags |= XE_MIGRATE_CLEAR_FLAG_FULL;
1104 		else if (handle_system_ccs)
1105 			flags |= XE_MIGRATE_CLEAR_FLAG_CCS_DATA;
1106 
1107 		fence = xe_migrate_clear(migrate, bo, new_mem, flags);
1108 	} else {
1109 		fence = xe_migrate_copy(migrate, bo, bo, old_mem, new_mem,
1110 					handle_system_ccs);
1111 	}
1112 	if (IS_ERR(fence)) {
1113 		ret = PTR_ERR(fence);
1114 		xe_pm_runtime_put(xe);
1115 		goto out;
1116 	}
1117 	if (!move_lacks_source) {
1118 		ret = ttm_bo_move_accel_cleanup(ttm_bo, fence, evict, true,
1119 						new_mem);
1120 		if (ret) {
1121 			dma_fence_wait(fence, false);
1122 			ttm_bo_move_null(ttm_bo, new_mem);
1123 			ret = 0;
1124 		}
1125 	} else {
1126 		/*
1127 		 * ttm_bo_move_accel_cleanup() may blow up if
1128 		 * bo->resource == NULL, so just attach the
1129 		 * fence and set the new resource.
1130 		 */
1131 		dma_resv_add_fence(ttm_bo->base.resv, fence,
1132 				   DMA_RESV_USAGE_KERNEL);
1133 		ttm_bo_move_null(ttm_bo, new_mem);
1134 	}
1135 
1136 	dma_fence_put(fence);
1137 	xe_pm_runtime_put(xe);
1138 
1139 	/*
1140 	 * CCS meta data is migrated from TT -> SMEM. So, let us detach the
1141 	 * BBs from BO as it is no longer needed.
1142 	 */
1143 	if (IS_VF_CCS_READY(xe) && old_mem_type == XE_PL_TT &&
1144 	    new_mem->mem_type == XE_PL_SYSTEM)
1145 		xe_sriov_vf_ccs_detach_bo(bo);
1146 
1147 	if (IS_VF_CCS_READY(xe) &&
1148 	    ((move_lacks_source && new_mem->mem_type == XE_PL_TT) ||
1149 	     (old_mem_type == XE_PL_SYSTEM && new_mem->mem_type == XE_PL_TT)) &&
1150 	    handle_system_ccs)
1151 		ret = xe_sriov_vf_ccs_attach_bo(bo);
1152 
1153 out:
1154 	if ((!ttm_bo->resource || ttm_bo->resource->mem_type == XE_PL_SYSTEM) &&
1155 	    ttm_bo->ttm) {
1156 		long timeout = dma_resv_wait_timeout(ttm_bo->base.resv,
1157 						     DMA_RESV_USAGE_KERNEL,
1158 						     false,
1159 						     MAX_SCHEDULE_TIMEOUT);
1160 		if (timeout < 0)
1161 			ret = timeout;
1162 
1163 		if (IS_VF_CCS_READY(xe))
1164 			xe_sriov_vf_ccs_detach_bo(bo);
1165 
1166 		xe_tt_unmap_sg(xe, ttm_bo->ttm);
1167 	}
1168 
1169 	return ret;
1170 }
1171 
1172 static long xe_bo_shrink_purge(struct ttm_operation_ctx *ctx,
1173 			       struct ttm_buffer_object *bo,
1174 			       unsigned long *scanned)
1175 {
1176 	struct xe_device *xe = ttm_to_xe_device(bo->bdev);
1177 	struct ttm_tt *tt = bo->ttm;
1178 	long lret;
1179 
1180 	/* Fake move to system, without copying data. */
1181 	if (bo->resource->mem_type != XE_PL_SYSTEM) {
1182 		struct ttm_resource *new_resource;
1183 
1184 		lret = ttm_bo_wait_ctx(bo, ctx);
1185 		if (lret)
1186 			return lret;
1187 
1188 		lret = ttm_bo_mem_space(bo, &sys_placement, &new_resource, ctx);
1189 		if (lret)
1190 			return lret;
1191 
1192 		xe_tt_unmap_sg(xe, bo->ttm);
1193 		ttm_bo_move_null(bo, new_resource);
1194 	}
1195 
1196 	*scanned += bo->ttm->num_pages;
1197 	lret = ttm_bo_shrink(ctx, bo, (struct ttm_bo_shrink_flags)
1198 			     {.purge = true,
1199 			      .writeback = false,
1200 			      .allow_move = false});
1201 
1202 	if (lret > 0) {
1203 		xe_ttm_tt_account_subtract(xe, bo->ttm);
1204 		update_global_total_pages(bo->bdev, -(long)tt->num_pages);
1205 	}
1206 
1207 	return lret;
1208 }
1209 
1210 static bool
1211 xe_bo_eviction_valuable(struct ttm_buffer_object *bo, const struct ttm_place *place)
1212 {
1213 	struct drm_gpuvm_bo *vm_bo;
1214 
1215 	if (!ttm_bo_eviction_valuable(bo, place))
1216 		return false;
1217 
1218 	if (!xe_bo_is_xe_bo(bo))
1219 		return true;
1220 
1221 	drm_gem_for_each_gpuvm_bo(vm_bo, &bo->base) {
1222 		if (xe_vm_is_validating(gpuvm_to_vm(vm_bo->vm)))
1223 			return false;
1224 	}
1225 
1226 	return true;
1227 }
1228 
/**
 * xe_bo_shrink() - Try to shrink an xe bo.
 * @ctx: The struct ttm_operation_ctx used for shrinking.
 * @bo: The TTM buffer object whose pages to shrink.
 * @flags: Flags governing the shrink behaviour.
 * @scanned: Pointer to a counter of the number of pages
 * attempted to shrink.
 *
 * Try to shrink or purge a bo, and if it succeeds, unmap dma.
 * Note that we need to be able to handle also non xe bos
 * (ghost bos), but only if the struct ttm_tt is embedded in
 * a struct xe_ttm_tt. When the function attempts to shrink
 * the pages of a buffer object, the value pointed to by @scanned
 * is updated.
 *
 * -EBUSY is returned, without touching @scanned, if the tt lacks
 * TTM_TT_FLAG_EXTERNAL_MAPPABLE, if @flags.purge is set but the tt is not
 * purgeable, or if xe_bo_eviction_valuable() rejects the bo. Zero is
 * returned, also without touching @scanned, if a runtime PM reference is
 * needed but xe_pm_runtime_get_if_active() fails.
 *
 * Return: The number of pages shrunken or purged, or negative error
 * code on failure.
 */
long xe_bo_shrink(struct ttm_operation_ctx *ctx, struct ttm_buffer_object *bo,
		  const struct xe_bo_shrink_flags flags,
		  unsigned long *scanned)
{
	struct ttm_tt *tt = bo->ttm;
	struct xe_ttm_tt *xe_tt = container_of(tt, struct xe_ttm_tt, ttm);
	struct ttm_place place = {.mem_type = bo->resource->mem_type};
	struct xe_bo *xe_bo = ttm_to_xe_bo(bo);
	struct xe_device *xe = ttm_to_xe_device(bo->bdev);
	bool needs_rpm;
	long lret = 0L;

	if (!(tt->page_flags & TTM_TT_FLAG_EXTERNAL_MAPPABLE) ||
	    (flags.purge && !xe_tt->purgeable))
		return -EBUSY;

	if (!xe_bo_eviction_valuable(bo, &place))
		return -EBUSY;

	/*
	 * Non xe bos (ghost bos), and xe bos for which
	 * xe_bo_get_unless_zero() fails, only take the purge path. No
	 * reference is held here, so return directly instead of going
	 * through out_unref.
	 */
	if (!xe_bo_is_xe_bo(bo) || !xe_bo_get_unless_zero(xe_bo))
		return xe_bo_shrink_purge(ctx, bo, scanned);

	if (xe_tt->purgeable) {
		/* Notify about the move only if the bo leaves a non-system placement. */
		if (bo->resource->mem_type != XE_PL_SYSTEM)
			lret = xe_bo_move_notify(xe_bo, ctx);
		if (!lret)
			lret = xe_bo_shrink_purge(ctx, bo, scanned);
		/* Pages were purged from a DONTNEED bo: record the purged state. */
		if (lret > 0 && xe_bo_madv_is_dontneed(xe_bo))
			xe_bo_set_purgeable_state(xe_bo,
						  XE_MADV_PURGEABLE_PURGED);
		goto out_unref;
	}

	/* System CCS needs gpu copy when moving PL_TT -> PL_SYSTEM */
	needs_rpm = (!IS_DGFX(xe) && bo->resource->mem_type != XE_PL_SYSTEM &&
		     xe_bo_needs_ccs_pages(xe_bo));
	/* lret is still 0 here, so a failed get reports "nothing shrunk". */
	if (needs_rpm && !xe_pm_runtime_get_if_active(xe))
		goto out_unref;

	*scanned += tt->num_pages;
	lret = ttm_bo_shrink(ctx, bo, (struct ttm_bo_shrink_flags)
			     {.purge = false,
			      .writeback = flags.writeback,
			      .allow_move = true});
	if (needs_rpm)
		xe_pm_runtime_put(xe);

	/* Pages were released: update the accounting. */
	if (lret > 0) {
		xe_ttm_tt_account_subtract(xe, tt);
		update_global_total_pages(bo->bdev, -(long)tt->num_pages);
	}

out_unref:
	/* Drop the reference taken by xe_bo_get_unless_zero() above. */
	xe_bo_put(xe_bo);

	return lret;
}
1304 
/**
 * xe_bo_notifier_prepare_pinned() - Prepare a pinned VRAM object to be backed
 * up in system memory.
 * @bo: The buffer object to prepare.
 *
 * On successful completion, the object backup pages are allocated. Expectation
 * is that this is called from the PM notifier, prior to suspend/hibernation.
 *
 * Nothing is done, and 0 is returned, if @bo is no longer pinned, is not in
 * VRAM, or has XE_BO_FLAG_PINNED_NORESTORE set.
 *
 * Return: 0 on success. Negative error code on failure.
 */
int xe_bo_notifier_prepare_pinned(struct xe_bo *bo)
{
	struct xe_device *xe = ttm_to_xe_device(bo->ttm.bdev);
	struct xe_validation_ctx ctx;
	struct drm_exec exec;
	struct xe_bo *backup;
	int ret = 0;

	xe_validation_guard(&ctx, &xe->val, &exec, (struct xe_val_flags) {.exclusive = true}, ret) {
		ret = drm_exec_lock_obj(&exec, &bo->ttm.base);
		drm_exec_retry_on_contention(&exec);
		xe_assert(xe, !ret);
		/* A backup object must not already be attached at this point. */
		xe_assert(xe, !bo->backup_obj);

		/*
		 * Since this is called from the PM notifier we might have raced with
		 * someone unpinning this after we dropped the pinned list lock and
		 * grabbing the above bo lock.
		 */
		if (!xe_bo_is_pinned(bo))
			break;

		if (!xe_bo_is_vram(bo))
			break;

		if (bo->flags & XE_BO_FLAG_PINNED_NORESTORE)
			break;

		/*
		 * Create a pinned system-memory bo of the same size, passing in
		 * the resv of @bo.
		 */
		backup = xe_bo_init_locked(xe, NULL, NULL, bo->ttm.base.resv, NULL, xe_bo_size(bo),
					   DRM_XE_GEM_CPU_CACHING_WB, ttm_bo_type_kernel,
					   XE_BO_FLAG_SYSTEM | XE_BO_FLAG_NEEDS_CPU_ACCESS |
					   XE_BO_FLAG_PINNED, &exec);
		if (IS_ERR(backup)) {
			drm_exec_retry_on_contention(&exec);
			ret = PTR_ERR(backup);
			xe_validation_retry_on_oom(&ctx, &ret);
			break;
		}

		backup->parent_obj = xe_bo_get(bo); /* Released by bo_destroy */
		ttm_bo_pin(&backup->ttm);
		bo->backup_obj = backup;
	}

	return ret;
}
1361 
1362 /**
1363  * xe_bo_notifier_unprepare_pinned() - Undo the previous prepare operation.
1364  * @bo: The buffer object to undo the prepare for.
1365  *
1366  * Always returns 0. The backup object is removed, if still present. Expectation
1367  * it that this called from the PM notifier when undoing the prepare step.
1368  *
1369  * Return: Always returns 0.
1370  */
1371 int xe_bo_notifier_unprepare_pinned(struct xe_bo *bo)
1372 {
1373 	xe_bo_lock(bo, false);
1374 	if (bo->backup_obj) {
1375 		ttm_bo_unpin(&bo->backup_obj->ttm);
1376 		xe_bo_put(bo->backup_obj);
1377 		bo->backup_obj = NULL;
1378 	}
1379 	xe_bo_unlock(bo);
1380 
1381 	return 0;
1382 }
1383 
/*
 * Copy the contents of @bo into @backup.
 *
 * User bos and bos flagged XE_BO_FLAG_PINNED_LATE_RESTORE are copied with
 * xe_migrate_copy(); the resulting fence is added to the resv of @bo with
 * DMA_RESV_USAGE_KERNEL. All other bos are copied by the CPU through vmaps
 * of both objects.
 *
 * On success, @backup is stored in bo->backup_obj unless one is already set.
 *
 * Returns 0 on success or a negative error code.
 */
static int xe_bo_evict_pinned_copy(struct xe_bo *bo, struct xe_bo *backup)
{
	struct xe_device *xe = xe_bo_device(bo);
	/* Set only when this function created the vmap of @bo itself. */
	bool unmap = false;
	int ret = 0;

	if (xe_bo_is_user(bo) || (bo->flags & XE_BO_FLAG_PINNED_LATE_RESTORE)) {
		struct xe_migrate *migrate;
		struct dma_fence *fence;

		if (bo->tile)
			migrate = bo->tile->migrate;
		else
			migrate = mem_type_to_migrate(xe, bo->ttm.resource->mem_type);

		/* The fence added below lands in the resv both objects share. */
		xe_assert(xe, bo->ttm.base.resv == backup->ttm.base.resv);
		ret = dma_resv_reserve_fences(bo->ttm.base.resv, 1);
		if (ret)
			goto out_backup;

		fence = xe_migrate_copy(migrate, bo, backup, bo->ttm.resource,
					backup->ttm.resource, false);
		if (IS_ERR(fence)) {
			ret = PTR_ERR(fence);
			goto out_backup;
		}

		dma_resv_add_fence(bo->ttm.base.resv, fence,
				   DMA_RESV_USAGE_KERNEL);
		dma_fence_put(fence);
	} else {
		ret = xe_bo_vmap(backup);
		if (ret)
			goto out_backup;

		/* Map @bo only if it has no vmap yet, and remember to undo it. */
		if (iosys_map_is_null(&bo->vmap)) {
			ret = xe_bo_vmap(bo);
			if (ret)
				goto out_vunmap;
			unmap = true;
		}

		xe_map_memcpy_from(xe, backup->vmap.vaddr, &bo->vmap, 0,
				   xe_bo_size(bo));
	}

	if (!bo->backup_obj)
		bo->backup_obj = backup;
	/* The success path of both branches falls through the labels below. */
out_vunmap:
	xe_bo_vunmap(backup);
out_backup:
	if (unmap)
		xe_bo_vunmap(bo);

	return ret;
}
1440 
/**
 * xe_bo_evict_pinned() - Evict a pinned VRAM object to system memory
 * @bo: The buffer object to move.
 *
 * On successful completion, the object contents are copied to a backup
 * object in system memory. If no backup object is attached to @bo yet, one
 * is created here.
 *
 * This is needed for special handling of pinned VRAM objects during
 * suspend-resume.
 *
 * Nothing is done, and 0 is returned, if @bo is not in VRAM or has
 * XE_BO_FLAG_PINNED_NORESTORE set. -EINVAL is returned (with a warning) if
 * @bo has no resource or is not pinned.
 *
 * Return: 0 on success. Negative error code on failure.
 */
int xe_bo_evict_pinned(struct xe_bo *bo)
{
	struct xe_device *xe = ttm_to_xe_device(bo->ttm.bdev);
	struct xe_validation_ctx ctx;
	struct drm_exec exec;
	struct xe_bo *backup = bo->backup_obj;
	/* True only if the backup object was allocated by this call. */
	bool backup_created = false;
	int ret = 0;

	xe_validation_guard(&ctx, &xe->val, &exec, (struct xe_val_flags) {.exclusive = true}, ret) {
		ret = drm_exec_lock_obj(&exec, &bo->ttm.base);
		drm_exec_retry_on_contention(&exec);
		xe_assert(xe, !ret);

		if (WARN_ON(!bo->ttm.resource)) {
			ret = -EINVAL;
			break;
		}

		if (WARN_ON(!xe_bo_is_pinned(bo))) {
			ret = -EINVAL;
			break;
		}

		if (!xe_bo_is_vram(bo))
			break;

		if (bo->flags & XE_BO_FLAG_PINNED_NORESTORE)
			break;

		/* No backup object attached to @bo yet: create one now. */
		if (!backup) {
			backup = xe_bo_init_locked(xe, NULL, NULL, bo->ttm.base.resv, NULL,
						   xe_bo_size(bo),
						   DRM_XE_GEM_CPU_CACHING_WB, ttm_bo_type_kernel,
						   XE_BO_FLAG_SYSTEM | XE_BO_FLAG_NEEDS_CPU_ACCESS |
						   XE_BO_FLAG_PINNED, &exec);
			if (IS_ERR(backup)) {
				drm_exec_retry_on_contention(&exec);
				ret = PTR_ERR(backup);
				xe_validation_retry_on_oom(&ctx, &ret);
				break;
			}
			backup->parent_obj = xe_bo_get(bo); /* Released by bo_destroy */
			backup_created = true;
		}

		ret = xe_bo_evict_pinned_copy(bo, backup);
	}

	/* Only drop a backup object that was created here and then failed. */
	if (ret && backup_created)
		xe_bo_put(backup);

	return ret;
}
1506 
/**
 * xe_bo_restore_pinned() - Restore a pinned VRAM object
 * @bo: The buffer object to move.
 *
 * On successful completion, the contents of the backup object are copied
 * back into @bo and the backup object is released.
 *
 * This is needed for special handling of pinned VRAM objects during
 * suspend-resume.
 *
 * Nothing is done, and 0 is returned, if @bo has no backup object.
 *
 * Return: 0 on success. Negative error code on failure.
 */
int xe_bo_restore_pinned(struct xe_bo *bo)
{
	struct ttm_operation_ctx ctx = {
		.interruptible = false,
		.gfp_retry_mayfail = false,
	};
	struct xe_device *xe = ttm_to_xe_device(bo->ttm.bdev);
	struct xe_bo *backup = bo->backup_obj;
	/* Set only when this function created the vmap of @bo itself. */
	bool unmap = false;
	int ret;

	if (!backup)
		return 0;

	xe_bo_lock(bo, false);

	/* An unpinned backup is validated against its own placement first. */
	if (!xe_bo_is_pinned(backup)) {
		ret = ttm_bo_validate(&backup->ttm, &backup->placement, &ctx);
		if (ret)
			goto out_unlock_bo;
	}

	if (xe_bo_is_user(bo) || (bo->flags & XE_BO_FLAG_PINNED_LATE_RESTORE)) {
		struct xe_migrate *migrate;
		struct dma_fence *fence;

		if (bo->tile)
			migrate = bo->tile->migrate;
		else
			migrate = mem_type_to_migrate(xe, bo->ttm.resource->mem_type);

		ret = dma_resv_reserve_fences(bo->ttm.base.resv, 1);
		if (ret)
			goto out_unlock_bo;

		fence = xe_migrate_copy(migrate, backup, bo,
					backup->ttm.resource, bo->ttm.resource,
					false);
		if (IS_ERR(fence)) {
			ret = PTR_ERR(fence);
			goto out_unlock_bo;
		}

		dma_resv_add_fence(bo->ttm.base.resv, fence,
				   DMA_RESV_USAGE_KERNEL);
		dma_fence_put(fence);
	} else {
		ret = xe_bo_vmap(backup);
		if (ret)
			goto out_unlock_bo;

		/* Map @bo only if it has no vmap yet, and remember to undo it. */
		if (iosys_map_is_null(&bo->vmap)) {
			ret = xe_bo_vmap(bo);
			if (ret)
				goto out_backup;
			unmap = true;
		}

		xe_map_memcpy_to(xe, &bo->vmap, 0, backup->vmap.vaddr,
				 xe_bo_size(bo));
	}

	/* Success: detach the backup so that it is released below. */
	bo->backup_obj = NULL;

out_backup:
	xe_bo_vunmap(backup);
	/*
	 * bo->backup_obj is only NULL here on the success path; on the error
	 * path that jumps to out_backup the backup object stays attached.
	 */
	if (!bo->backup_obj) {
		if (xe_bo_is_pinned(backup))
			ttm_bo_unpin(&backup->ttm);
		xe_bo_put(backup);
	}
out_unlock_bo:
	if (unmap)
		xe_bo_vunmap(bo);
	xe_bo_unlock(bo);
	return ret;
}
1595 
1596 int xe_bo_dma_unmap_pinned(struct xe_bo *bo)
1597 {
1598 	struct ttm_buffer_object *ttm_bo = &bo->ttm;
1599 	struct ttm_tt *tt = ttm_bo->ttm;
1600 
1601 	if (tt) {
1602 		struct xe_ttm_tt *xe_tt = container_of(tt, typeof(*xe_tt), ttm);
1603 
1604 		if (ttm_bo->type == ttm_bo_type_sg && ttm_bo->sg) {
1605 			dma_buf_unmap_attachment(ttm_bo->base.import_attach,
1606 						 ttm_bo->sg,
1607 						 DMA_BIDIRECTIONAL);
1608 			ttm_bo->sg = NULL;
1609 			xe_tt->sg = NULL;
1610 		} else if (xe_tt->sg) {
1611 			dma_unmap_sgtable(ttm_to_xe_device(ttm_bo->bdev)->drm.dev,
1612 					  xe_tt->sg,
1613 					  DMA_BIDIRECTIONAL, 0);
1614 			sg_free_table(xe_tt->sg);
1615 			xe_tt->sg = NULL;
1616 		}
1617 	}
1618 
1619 	return 0;
1620 }
1621 
1622 static unsigned long xe_ttm_io_mem_pfn(struct ttm_buffer_object *ttm_bo,
1623 				       unsigned long page_offset)
1624 {
1625 	struct xe_bo *bo = ttm_to_xe_bo(ttm_bo);
1626 	struct xe_res_cursor cursor;
1627 	struct xe_vram_region *vram;
1628 
1629 	if (ttm_bo->resource->mem_type == XE_PL_STOLEN)
1630 		return xe_ttm_stolen_io_offset(bo, page_offset << PAGE_SHIFT) >> PAGE_SHIFT;
1631 
1632 	vram = xe_map_resource_to_region(ttm_bo->resource);
1633 	xe_res_first(ttm_bo->resource, (u64)page_offset << PAGE_SHIFT, 0, &cursor);
1634 	return (vram->io_start + cursor.start) >> PAGE_SHIFT;
1635 }
1636 
1637 static void __xe_bo_vunmap(struct xe_bo *bo);
1638 
/*
 * Try to take the embedded reservation lock (base._resv) of a bo whose
 * ttm_bo refcount has already reached zero. The trylock is performed while
 * holding the device lru_lock.
 *
 * Returns true if the lock was taken; the caller must then unlock it.
 *
 * TODO: Move this function to TTM so we don't rely on how TTM does its
 * locking, thereby abusing TTM internals.
 */
static bool xe_ttm_bo_lock_in_destructor(struct ttm_buffer_object *ttm_bo)
{
	struct xe_device *xe = ttm_to_xe_device(ttm_bo->bdev);
	bool locked;

	/* Only valid once the last ttm_bo reference is gone. */
	xe_assert(xe, !kref_read(&ttm_bo->kref));

	/*
	 * We can typically only race with TTM trylocking under the
	 * lru_lock, which will immediately be unlocked again since
	 * the ttm_bo refcount is zero at this point. So trylocking *should*
	 * always succeed here, as long as we hold the lru lock.
	 */
	spin_lock(&ttm_bo->bdev->lru_lock);
	locked = dma_resv_trylock(&ttm_bo->base._resv);
	spin_unlock(&ttm_bo->bdev->lru_lock);
	xe_assert(xe, locked);

	return locked;
}
1663 
/*
 * Release notification for a bo (installed as .release_notify in
 * xe_ttm_funcs).
 *
 * For xe bos, every unsignaled xe preempt fence found in the embedded resv
 * is replaced with the stub fence obtained from dma_fence_get_stub(). Non xe
 * bos are ignored, and nothing is done if the resv lock cannot be taken.
 */
static void xe_ttm_bo_release_notify(struct ttm_buffer_object *ttm_bo)
{
	struct dma_resv_iter cursor;
	struct dma_fence *fence;
	/* Fetched lazily, on the first fence that needs replacing. */
	struct dma_fence *replacement = NULL;
	struct xe_bo *bo;

	if (!xe_bo_is_xe_bo(ttm_bo))
		return;

	bo = ttm_to_xe_bo(ttm_bo);
	/* A fully created bo must not have gem references left at this point. */
	xe_assert(xe_bo_device(bo), !(bo->created && kref_read(&ttm_bo->base.refcount)));

	if (!xe_ttm_bo_lock_in_destructor(ttm_bo))
		return;

	/*
	 * Scrub the preempt fences if any. The unbind fence is already
	 * attached to the resv.
	 * TODO: Don't do this for external bos once we scrub them after
	 * unbind.
	 */
	dma_resv_for_each_fence(&cursor, &ttm_bo->base._resv,
				DMA_RESV_USAGE_BOOKKEEP, fence) {
		if (xe_fence_is_xe_preempt(fence) &&
		    !dma_fence_is_signaled(fence)) {
			if (!replacement)
				replacement = dma_fence_get_stub();

			/* Replaces the fences of this fence's context in the resv. */
			dma_resv_replace_fences(&ttm_bo->base._resv,
						fence->context,
						replacement,
						DMA_RESV_USAGE_BOOKKEEP);
		}
	}
	dma_fence_put(replacement);

	dma_resv_unlock(&ttm_bo->base._resv);
}
1703 
1704 static void xe_ttm_bo_delete_mem_notify(struct ttm_buffer_object *ttm_bo)
1705 {
1706 	struct xe_bo *bo = ttm_to_xe_bo(ttm_bo);
1707 
1708 	if (!xe_bo_is_xe_bo(ttm_bo))
1709 		return;
1710 
1711 	if (IS_VF_CCS_READY(ttm_to_xe_device(ttm_bo->bdev)))
1712 		xe_sriov_vf_ccs_detach_bo(bo);
1713 
1714 	/*
1715 	 * Object is idle and about to be destroyed. Release the
1716 	 * dma-buf attachment.
1717 	 */
1718 	if (ttm_bo->type == ttm_bo_type_sg && ttm_bo->sg) {
1719 		struct xe_ttm_tt *xe_tt = container_of(ttm_bo->ttm,
1720 						       struct xe_ttm_tt, ttm);
1721 
1722 		dma_buf_unmap_attachment(ttm_bo->base.import_attach, ttm_bo->sg,
1723 					 DMA_BIDIRECTIONAL);
1724 		ttm_bo->sg = NULL;
1725 		xe_tt->sg = NULL;
1726 	}
1727 }
1728 
1729 static void xe_ttm_bo_swap_notify(struct ttm_buffer_object *ttm_bo)
1730 {
1731 	struct ttm_operation_ctx ctx = {
1732 		.interruptible = false,
1733 		.gfp_retry_mayfail = false,
1734 	};
1735 
1736 	if (ttm_bo->ttm) {
1737 		struct xe_ttm_tt *xe_tt =
1738 			container_of(ttm_bo->ttm, struct xe_ttm_tt, ttm);
1739 
1740 		if (xe_tt->purgeable)
1741 			xe_ttm_bo_purge(ttm_bo, &ctx);
1742 	}
1743 }
1744 
/*
 * Read from or write to the VRAM backing of a bo (installed as
 * .access_memory in xe_ttm_funcs).
 *
 * @ttm_bo: The buffer object to access; its lock must be held.
 * @offset: Byte offset into the bo.
 * @buf: Kernel buffer to copy from (@write != 0) or to (@write == 0).
 * @len: Number of bytes to transfer.
 * @write: Non-zero to write to the bo, zero to read from it.
 *
 * Accesses of SZ_16K bytes or more, and accesses to bos that are not in
 * CPU-visible VRAM, are handed to xe_migrate_access_memory(). Smaller
 * accesses to visible VRAM are copied page by page through the region's
 * io mapping.
 *
 * Returns @len on success, -EIO if the bo is not placed in VRAM, or the
 * non-zero result of xe_migrate_access_memory().
 */
static int xe_ttm_access_memory(struct ttm_buffer_object *ttm_bo,
				unsigned long offset, void *buf, int len,
				int write)
{
	struct xe_bo *bo = ttm_to_xe_bo(ttm_bo);
	struct xe_device *xe = ttm_to_xe_device(ttm_bo->bdev);
	struct iosys_map vmap;
	struct xe_res_cursor cursor;
	struct xe_vram_region *vram;
	int bytes_left = len;
	int err = 0;

	xe_bo_assert_held(bo);
	xe_device_assert_mem_access(xe);

	if (!mem_type_is_vram(ttm_bo->resource->mem_type))
		return -EIO;

	if (!xe_bo_is_visible_vram(bo) || len >= SZ_16K) {
		struct xe_migrate *migrate =
			mem_type_to_migrate(xe, ttm_bo->resource->mem_type);

		err = xe_migrate_access_memory(migrate, bo, offset, buf, len,
					       write);
		goto out;
	}

	/* Position the cursor at the start of the page containing @offset. */
	vram = xe_map_resource_to_region(ttm_bo->resource);
	xe_res_first(ttm_bo->resource, offset & PAGE_MASK,
		     xe_bo_size(bo) - (offset & PAGE_MASK), &cursor);

	do {
		/* Offset inside the current page, and bytes left in that page. */
		unsigned long page_offset = (offset & ~PAGE_MASK);
		int byte_count = min((int)(PAGE_SIZE - page_offset), bytes_left);

		iosys_map_set_vaddr_iomem(&vmap, (u8 __iomem *)vram->mapping +
					  cursor.start);
		if (write)
			xe_map_memcpy_to(xe, &vmap, page_offset, buf, byte_count);
		else
			xe_map_memcpy_from(xe, buf, &vmap, page_offset, byte_count);

		buf += byte_count;
		offset += byte_count;
		bytes_left -= byte_count;
		/* Step to the next page only if there is more to copy. */
		if (bytes_left)
			xe_res_next(&cursor, PAGE_SIZE);
	} while (bytes_left);

out:
	return err ?: len;
}
1797 
1798 const struct ttm_device_funcs xe_ttm_funcs = {
1799 	.ttm_tt_create = xe_ttm_tt_create,
1800 	.ttm_tt_populate = xe_ttm_tt_populate,
1801 	.ttm_tt_unpopulate = xe_ttm_tt_unpopulate,
1802 	.ttm_tt_destroy = xe_ttm_tt_destroy,
1803 	.evict_flags = xe_evict_flags,
1804 	.move = xe_bo_move,
1805 	.io_mem_reserve = xe_ttm_io_mem_reserve,
1806 	.io_mem_pfn = xe_ttm_io_mem_pfn,
1807 	.access_memory = xe_ttm_access_memory,
1808 	.release_notify = xe_ttm_bo_release_notify,
1809 	.eviction_valuable = xe_bo_eviction_valuable,
1810 	.delete_mem_notify = xe_ttm_bo_delete_mem_notify,
1811 	.swap_notify = xe_ttm_bo_swap_notify,
1812 };
1813 
/*
 * Final destructor for an xe bo: releases the gem object, removes GGTT
 * nodes, drops the client/vm/parent references held by the bo, unlinks it
 * from the VRAM userfault list and frees the struct xe_bo.
 */
static void xe_ttm_bo_destroy(struct ttm_buffer_object *ttm_bo)
{
	struct xe_bo *bo = ttm_to_xe_bo(ttm_bo);
	struct xe_device *xe = ttm_to_xe_device(ttm_bo->bdev);
	struct xe_tile *tile;
	u8 id;

	/* Imported (prime) objects need their import state torn down first. */
	if (bo->ttm.base.import_attach)
		drm_prime_gem_destroy(&bo->ttm.base, NULL);
	drm_gem_object_release(&bo->ttm.base);

	/* No gpuva may still reference the gem object. */
	xe_assert(xe, list_empty(&ttm_bo->base.gpuva.list));

	for_each_tile(tile, xe, id)
		if (bo->ggtt_node[id])
			xe_ggtt_remove_bo(tile->mem.ggtt, bo);

#ifdef CONFIG_PROC_FS
	if (bo->client)
		xe_drm_client_remove_bo(bo);
#endif

	/* Only user bos drop a vm reference here. */
	if (bo->vm && xe_bo_is_user(bo))
		xe_vm_put(bo->vm);

	/* Set on backup objects; drops the reference held on the parent bo. */
	if (bo->parent_obj)
		xe_bo_put(bo->parent_obj);

	mutex_lock(&xe->mem_access.vram_userfault.lock);
	if (!list_empty(&bo->vram_userfault_link))
		list_del(&bo->vram_userfault_link);
	mutex_unlock(&xe->mem_access.vram_userfault.lock);

	kfree(bo);
}
1849 
1850 static void xe_gem_object_free(struct drm_gem_object *obj)
1851 {
1852 	/* Our BO reference counting scheme works as follows:
1853 	 *
1854 	 * The gem object kref is typically used throughout the driver,
1855 	 * and the gem object holds a ttm_buffer_object refcount, so
1856 	 * that when the last gem object reference is put, which is when
1857 	 * we end up in this function, we put also that ttm_buffer_object
1858 	 * refcount. Anything using gem interfaces is then no longer
1859 	 * allowed to access the object in a way that requires a gem
1860 	 * refcount, including locking the object.
1861 	 *
1862 	 * driver ttm callbacks is allowed to use the ttm_buffer_object
1863 	 * refcount directly if needed.
1864 	 */
1865 	__xe_bo_vunmap(gem_to_xe_bo(obj));
1866 	ttm_bo_fini(container_of(obj, struct ttm_buffer_object, base));
1867 }
1868 
1869 static void xe_gem_object_close(struct drm_gem_object *obj,
1870 				struct drm_file *file_priv)
1871 {
1872 	struct xe_bo *bo = gem_to_xe_bo(obj);
1873 
1874 	if (bo->vm && !xe_vm_in_fault_mode(bo->vm)) {
1875 		xe_assert(xe_bo_device(bo), xe_bo_is_user(bo));
1876 
1877 		xe_bo_lock(bo, false);
1878 		ttm_bo_set_bulk_move(&bo->ttm, NULL);
1879 		xe_bo_unlock(bo);
1880 	}
1881 }
1882 
1883 static bool should_migrate_to_smem(struct xe_bo *bo)
1884 {
1885 	/*
1886 	 * NOTE: The following atomic checks are platform-specific. For example,
1887 	 * if a device supports CXL atomics, these may not be necessary or
1888 	 * may behave differently.
1889 	 */
1890 
1891 	return bo->attr.atomic_access == DRM_XE_ATOMIC_GLOBAL ||
1892 	       bo->attr.atomic_access == DRM_XE_ATOMIC_CPU;
1893 }
1894 
1895 static int xe_bo_wait_usage_kernel(struct xe_bo *bo, struct ttm_operation_ctx *ctx)
1896 {
1897 	long lerr;
1898 
1899 	if (ctx->no_wait_gpu)
1900 		return dma_resv_test_signaled(bo->ttm.base.resv, DMA_RESV_USAGE_KERNEL) ?
1901 			0 : -EBUSY;
1902 
1903 	lerr = dma_resv_wait_timeout(bo->ttm.base.resv, DMA_RESV_USAGE_KERNEL,
1904 				     ctx->interruptible, MAX_SCHEDULE_TIMEOUT);
1905 	if (lerr < 0)
1906 		return lerr;
1907 	if (lerr == 0)
1908 		return -EBUSY;
1909 
1910 	return 0;
1911 }
1912 
1913 /* Populate the bo if swapped out, or migrate if the access mode requires that. */
1914 static int xe_bo_fault_migrate(struct xe_bo *bo, struct ttm_operation_ctx *ctx,
1915 			       struct drm_exec *exec)
1916 {
1917 	struct ttm_buffer_object *tbo = &bo->ttm;
1918 	int err = 0;
1919 
1920 	if (ttm_manager_type(tbo->bdev, tbo->resource->mem_type)->use_tt) {
1921 		err = xe_bo_wait_usage_kernel(bo, ctx);
1922 		if (!err)
1923 			err = ttm_bo_populate(&bo->ttm, ctx);
1924 	} else if (should_migrate_to_smem(bo)) {
1925 		xe_assert(xe_bo_device(bo), bo->flags & XE_BO_FLAG_SYSTEM);
1926 		err = xe_bo_migrate(bo, XE_PL_TT, ctx, exec);
1927 	}
1928 
1929 	return err;
1930 }
1931 
1932 /* Call into TTM to populate PTEs, and register bo for PTE removal on runtime suspend. */
1933 static vm_fault_t __xe_bo_cpu_fault(struct vm_fault *vmf, struct xe_device *xe, struct xe_bo *bo)
1934 {
1935 	vm_fault_t ret;
1936 
1937 	trace_xe_bo_cpu_fault(bo);
1938 
1939 	ret = ttm_bo_vm_fault_reserved(vmf, vmf->vma->vm_page_prot,
1940 				       TTM_BO_VM_NUM_PREFAULT);
1941 	/*
1942 	 * When TTM is actually called to insert PTEs, ensure no blocking conditions
1943 	 * remain, in which case TTM may drop locks and return VM_FAULT_RETRY.
1944 	 */
1945 	xe_assert(xe, ret != VM_FAULT_RETRY);
1946 
1947 	if (ret == VM_FAULT_NOPAGE &&
1948 	    mem_type_is_vram(bo->ttm.resource->mem_type)) {
1949 		mutex_lock(&xe->mem_access.vram_userfault.lock);
1950 		if (list_empty(&bo->vram_userfault_link))
1951 			list_add(&bo->vram_userfault_link,
1952 				 &xe->mem_access.vram_userfault.list);
1953 		mutex_unlock(&xe->mem_access.vram_userfault.lock);
1954 	}
1955 
1956 	return ret;
1957 }
1958 
1959 static vm_fault_t xe_err_to_fault_t(int err)
1960 {
1961 	switch (err) {
1962 	case 0:
1963 	case -EINTR:
1964 	case -ERESTARTSYS:
1965 	case -EAGAIN:
1966 		return VM_FAULT_NOPAGE;
1967 	case -ENOMEM:
1968 	case -ENOSPC:
1969 		return VM_FAULT_OOM;
1970 	default:
1971 		break;
1972 	}
1973 	return VM_FAULT_SIGBUS;
1974 }
1975 
1976 static bool xe_ttm_bo_is_imported(struct ttm_buffer_object *tbo)
1977 {
1978 	dma_resv_assert_held(tbo->base.resv);
1979 
1980 	return tbo->ttm &&
1981 		(tbo->ttm->page_flags & (TTM_TT_FLAG_EXTERNAL | TTM_TT_FLAG_EXTERNAL_MAPPABLE)) ==
1982 		TTM_TT_FLAG_EXTERNAL;
1983 }
1984 
/*
 * Non-blocking attempt at resolving a CPU fault on @bo.
 *
 * Only trylocks the bo and uses a no_wait_gpu operation context. The return
 * value starts out as VM_FAULT_RETRY, so every early exit that does not
 * assign a different code returns VM_FAULT_RETRY.
 *
 * @needs_rpm: If true, a runtime PM reference is taken, but only if the
 * device is already active; otherwise VM_FAULT_RETRY is returned.
 *
 * Returns VM_FAULT_SIGBUS for DONTNEED/purged or imported bos, the
 * translated error of xe_bo_fault_migrate() (except for -ENOMEM, -ENOSPC
 * and -EBUSY, which yield VM_FAULT_RETRY), the result of
 * __xe_bo_cpu_fault() when the kernel fences have signaled, and
 * VM_FAULT_RETRY otherwise.
 */
static vm_fault_t xe_bo_cpu_fault_fastpath(struct vm_fault *vmf, struct xe_device *xe,
					   struct xe_bo *bo, bool needs_rpm)
{
	struct ttm_buffer_object *tbo = &bo->ttm;
	vm_fault_t ret = VM_FAULT_RETRY;
	struct xe_validation_ctx ctx;
	struct ttm_operation_ctx tctx = {
		.interruptible = true,
		.no_wait_gpu = true,
		.gfp_retry_mayfail = true,

	};
	int err;

	if (needs_rpm && !xe_pm_runtime_get_if_active(xe))
		return VM_FAULT_RETRY;

	err = xe_validation_ctx_init(&ctx, &xe->val, NULL,
				     (struct xe_val_flags) {
					     .interruptible = true,
					     .no_block = true
				     });
	if (err)
		goto out_pm;

	/* Never block on the bo lock here; contention means VM_FAULT_RETRY. */
	if (!dma_resv_trylock(tbo->base.resv))
		goto out_validation;

	/*
	 * Reject CPU faults to purgeable BOs. DONTNEED BOs can be purged
	 * at any time, and purged BOs have no backing store. Either case
	 * is undefined behavior for CPU access.
	 */
	if (xe_bo_madv_is_dontneed(bo) || xe_bo_is_purged(bo)) {
		ret = VM_FAULT_SIGBUS;
		goto out_unlock;
	}

	if (xe_ttm_bo_is_imported(tbo)) {
		ret = VM_FAULT_SIGBUS;
		drm_dbg(&xe->drm, "CPU trying to access an imported buffer object.\n");
		goto out_unlock;
	}

	err = xe_bo_fault_migrate(bo, &tctx, NULL);
	if (err) {
		/* Return VM_FAULT_RETRY on these errors. */
		if (err != -ENOMEM && err != -ENOSPC && err != -EBUSY)
			ret = xe_err_to_fault_t(err);
		goto out_unlock;
	}

	/* Insert PTEs only if no kernel fence is pending; else keep RETRY. */
	if (dma_resv_test_signaled(bo->ttm.base.resv, DMA_RESV_USAGE_KERNEL))
		ret = __xe_bo_cpu_fault(vmf, xe, bo);

out_unlock:
	dma_resv_unlock(tbo->base.resv);
out_validation:
	xe_validation_ctx_fini(&ctx);
out_pm:
	if (needs_rpm)
		xe_pm_runtime_put(xe);

	return ret;
}
2050 
2051 static vm_fault_t xe_bo_cpu_fault(struct vm_fault *vmf)
2052 {
2053 	struct ttm_buffer_object *tbo = vmf->vma->vm_private_data;
2054 	struct drm_device *ddev = tbo->base.dev;
2055 	struct xe_device *xe = to_xe_device(ddev);
2056 	struct xe_bo *bo = ttm_to_xe_bo(tbo);
2057 	bool needs_rpm = bo->flags & XE_BO_FLAG_VRAM_MASK;
2058 	bool retry_after_wait = false;
2059 	struct xe_validation_ctx ctx;
2060 	struct drm_exec exec;
2061 	vm_fault_t ret;
2062 	int err = 0;
2063 	int idx;
2064 
2065 	if (xe_device_wedged(xe) || !drm_dev_enter(&xe->drm, &idx))
2066 		return ttm_bo_vm_dummy_page(vmf, vmf->vma->vm_page_prot);
2067 
2068 	ret = xe_bo_cpu_fault_fastpath(vmf, xe, bo, needs_rpm);
2069 	if (ret != VM_FAULT_RETRY)
2070 		goto out;
2071 
2072 	if (fault_flag_allow_retry_first(vmf->flags)) {
2073 		if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
2074 			goto out;
2075 		retry_after_wait = true;
2076 		xe_bo_get(bo);
2077 		mmap_read_unlock(vmf->vma->vm_mm);
2078 	} else {
2079 		ret = VM_FAULT_NOPAGE;
2080 	}
2081 
2082 	/*
2083 	 * The fastpath failed and we were not required to return and retry immediately.
2084 	 * We're now running in one of two modes:
2085 	 *
2086 	 * 1) retry_after_wait == true: The mmap_read_lock() is dropped, and we're trying
2087 	 * to resolve blocking waits. But we can't resolve the fault since the
2088 	 * mmap_read_lock() is dropped. After retrying the fault, the aim is that the fastpath
2089 	 * should succeed. But it may fail since we drop the bo lock.
2090 	 *
2091 	 * 2) retry_after_wait == false: The fastpath failed, typically even after
2092 	 * a retry. Do whatever's necessary to resolve the fault.
2093 	 *
2094 	 * This construct is recommended to avoid excessive waits under the mmap_lock.
2095 	 */
2096 
2097 	if (needs_rpm)
2098 		xe_pm_runtime_get(xe);
2099 
2100 	xe_validation_guard(&ctx, &xe->val, &exec, (struct xe_val_flags) {.interruptible = true},
2101 			    err) {
2102 		struct ttm_operation_ctx tctx = {
2103 			.interruptible = true,
2104 			.no_wait_gpu = false,
2105 			.gfp_retry_mayfail = retry_after_wait,
2106 		};
2107 
2108 		err = drm_exec_lock_obj(&exec, &tbo->base);
2109 		drm_exec_retry_on_contention(&exec);
2110 		if (err)
2111 			break;
2112 
2113 		/*
2114 		 * Reject CPU faults to purgeable BOs. DONTNEED BOs can be
2115 		 * purged at any time, and purged BOs have no backing store.
2116 		 */
2117 		if (xe_bo_madv_is_dontneed(bo) || xe_bo_is_purged(bo)) {
2118 			err = -EFAULT;
2119 			break;
2120 		}
2121 
2122 		if (xe_ttm_bo_is_imported(tbo)) {
2123 			err = -EFAULT;
2124 			drm_dbg(&xe->drm, "CPU trying to access an imported buffer object.\n");
2125 			break;
2126 		}
2127 
2128 		err = xe_bo_fault_migrate(bo, &tctx, &exec);
2129 		if (err) {
2130 			drm_exec_retry_on_contention(&exec);
2131 			xe_validation_retry_on_oom(&ctx, &err);
2132 			break;
2133 		}
2134 
2135 		err = xe_bo_wait_usage_kernel(bo, &tctx);
2136 		if (err)
2137 			break;
2138 
2139 		if (!retry_after_wait)
2140 			ret = __xe_bo_cpu_fault(vmf, xe, bo);
2141 	}
2142 	/* if retry_after_wait == true, we *must* return VM_FAULT_RETRY. */
2143 	if (err && !retry_after_wait)
2144 		ret = xe_err_to_fault_t(err);
2145 
2146 	if (needs_rpm)
2147 		xe_pm_runtime_put(xe);
2148 
2149 	if (retry_after_wait)
2150 		xe_bo_put(bo);
2151 out:
2152 	drm_dev_exit(idx);
2153 
2154 	return ret;
2155 }
2156 
2157 static int xe_bo_vm_access(struct vm_area_struct *vma, unsigned long addr,
2158 			   void *buf, int len, int write)
2159 {
2160 	struct ttm_buffer_object *ttm_bo = vma->vm_private_data;
2161 	struct xe_bo *bo = ttm_to_xe_bo(ttm_bo);
2162 	struct xe_device *xe = xe_bo_device(bo);
2163 
2164 	guard(xe_pm_runtime)(xe);
2165 	return ttm_bo_vm_access(vma, addr, buf, len, write);
2166 }
2167 
2168 /**
2169  * xe_bo_read() - Read from an xe_bo
2170  * @bo: The buffer object to read from.
2171  * @offset: The byte offset to start reading from.
2172  * @dst: Location to store the read.
2173  * @size: Size in bytes for the read.
2174  *
2175  * Read @size bytes from the @bo, starting from @offset, storing into @dst.
2176  *
2177  * Return: Zero on success, or negative error.
2178  */
2179 int xe_bo_read(struct xe_bo *bo, u64 offset, void *dst, int size)
2180 {
2181 	int ret;
2182 
2183 	ret = ttm_bo_access(&bo->ttm, offset, dst, size, 0);
2184 	if (ret >= 0 && ret != size)
2185 		ret = -EIO;
2186 	else if (ret == size)
2187 		ret = 0;
2188 
2189 	return ret;
2190 }
2191 
2192 static const struct vm_operations_struct xe_gem_vm_ops = {
2193 	.fault = xe_bo_cpu_fault,
2194 	.open = ttm_bo_vm_open,
2195 	.close = ttm_bo_vm_close,
2196 	.access = xe_bo_vm_access,
2197 };
2198 
2199 static int xe_gem_object_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma)
2200 {
2201 	struct xe_bo *bo = gem_to_xe_bo(obj);
2202 	int err = 0;
2203 
2204 	/*
2205 	 * Reject mmap of purgeable BOs. DONTNEED BOs can be purged
2206 	 * at any time, making CPU access undefined behavior. Purged BOs have
2207 	 * no backing store and are permanently invalid.
2208 	 */
2209 	err = xe_bo_lock(bo, true);
2210 	if (err)
2211 		return err;
2212 
2213 	if (xe_bo_madv_is_dontneed(bo))
2214 		err = -EBUSY;
2215 	else if (xe_bo_is_purged(bo))
2216 		err = -EINVAL;
2217 	xe_bo_unlock(bo);
2218 	if (err)
2219 		return err;
2220 
2221 	return drm_gem_ttm_mmap(obj, vma);
2222 }
2223 
2224 static const struct drm_gem_object_funcs xe_gem_object_funcs = {
2225 	.free = xe_gem_object_free,
2226 	.close = xe_gem_object_close,
2227 	.mmap = xe_gem_object_mmap,
2228 	.export = xe_gem_prime_export,
2229 	.vm_ops = &xe_gem_vm_ops,
2230 };
2231 
2232 /**
2233  * xe_bo_alloc - Allocate storage for a struct xe_bo
2234  *
2235  * This function is intended to allocate storage to be used for input
2236  * to __xe_bo_create_locked(), in the case a pointer to the bo to be
2237  * created is needed before the call to __xe_bo_create_locked().
2238  * If __xe_bo_create_locked ends up never to be called, then the
2239  * storage allocated with this function needs to be freed using
2240  * xe_bo_free().
2241  *
2242  * Return: A pointer to an uninitialized struct xe_bo on success,
2243  * ERR_PTR(-ENOMEM) on error.
2244  */
2245 struct xe_bo *xe_bo_alloc(void)
2246 {
2247 	struct xe_bo *bo = kzalloc_obj(*bo);
2248 
2249 	if (!bo)
2250 		return ERR_PTR(-ENOMEM);
2251 
2252 	return bo;
2253 }
2254 
/**
 * xe_bo_free - Free storage allocated using xe_bo_alloc()
 * @bo: The buffer object storage to release.
 *
 * Only valid for storage that was never successfully handed over to
 * bo initialization; see xe_bo_alloc() for the intended use-cases.
 */
void xe_bo_free(struct xe_bo *bo)
{
	kfree(bo);
}
2265 
2266 /**
2267  * xe_bo_init_locked() - Initialize or create an xe_bo.
2268  * @xe: The xe device.
2269  * @bo: An already allocated buffer object or NULL
2270  * if the function should allocate a new one.
2271  * @tile: The tile to select for migration of this bo, and the tile used for
2272  * GGTT binding if any. Only to be non-NULL for ttm_bo_type_kernel bos.
2273  * @resv: Pointer to a locked shared reservation object to use for this bo,
2274  * or NULL for the xe_bo to use its own.
2275  * @bulk: The bulk move to use for LRU bumping, or NULL for external bos.
2276  * @size: The storage size to use for the bo.
2277  * @cpu_caching: The cpu caching used for system memory backing store.
2278  * @type: The TTM buffer object type.
2279  * @flags: XE_BO_FLAG_ flags.
2280  * @exec: The drm_exec transaction to use for exhaustive eviction.
2281  *
2282  * Initialize or create an xe buffer object. On failure, any allocated buffer
2283  * object passed in @bo will have been unreferenced.
2284  *
2285  * Return: The buffer object on success. Negative error pointer on failure.
2286  */
2287 struct xe_bo *xe_bo_init_locked(struct xe_device *xe, struct xe_bo *bo,
2288 				struct xe_tile *tile, struct dma_resv *resv,
2289 				struct ttm_lru_bulk_move *bulk, size_t size,
2290 				u16 cpu_caching, enum ttm_bo_type type,
2291 				u32 flags, struct drm_exec *exec)
2292 {
2293 	struct ttm_operation_ctx ctx = {
2294 		.interruptible = true,
2295 		.no_wait_gpu = false,
2296 		.gfp_retry_mayfail = true,
2297 	};
2298 	struct ttm_placement *placement;
2299 	uint32_t alignment;
2300 	size_t aligned_size;
2301 	int err;
2302 
2303 	/* Only kernel objects should set GT */
2304 	xe_assert(xe, !tile || type == ttm_bo_type_kernel);
2305 
2306 	if (XE_WARN_ON(!size)) {
2307 		xe_bo_free(bo);
2308 		return ERR_PTR(-EINVAL);
2309 	}
2310 
2311 	/* XE_BO_FLAG_GGTTx requires XE_BO_FLAG_GGTT also be set */
2312 	if ((flags & XE_BO_FLAG_GGTT_ALL) && !(flags & XE_BO_FLAG_GGTT)) {
2313 		xe_bo_free(bo);
2314 		return ERR_PTR(-EINVAL);
2315 	}
2316 
2317 	if (flags & (XE_BO_FLAG_VRAM_MASK | XE_BO_FLAG_STOLEN) &&
2318 	    !(flags & XE_BO_FLAG_IGNORE_MIN_PAGE_SIZE) &&
2319 	    ((xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K) ||
2320 	     (flags & (XE_BO_FLAG_NEEDS_64K | XE_BO_FLAG_NEEDS_2M)))) {
2321 		size_t align = flags & XE_BO_FLAG_NEEDS_2M ? SZ_2M : SZ_64K;
2322 
2323 		aligned_size = ALIGN(size, align);
2324 		if (type != ttm_bo_type_device)
2325 			size = ALIGN(size, align);
2326 		flags |= XE_BO_FLAG_INTERNAL_64K;
2327 		alignment = align >> PAGE_SHIFT;
2328 	} else {
2329 		aligned_size = ALIGN(size, SZ_4K);
2330 		flags &= ~XE_BO_FLAG_INTERNAL_64K;
2331 		alignment = SZ_4K >> PAGE_SHIFT;
2332 	}
2333 
2334 	if (type == ttm_bo_type_device && aligned_size != size) {
2335 		xe_bo_free(bo);
2336 		return ERR_PTR(-EINVAL);
2337 	}
2338 
2339 	if (!bo) {
2340 		bo = xe_bo_alloc();
2341 		if (IS_ERR(bo))
2342 			return bo;
2343 	}
2344 
2345 	bo->ccs_cleared = false;
2346 	bo->tile = tile;
2347 	bo->flags = flags;
2348 	bo->cpu_caching = cpu_caching;
2349 	bo->ttm.base.funcs = &xe_gem_object_funcs;
2350 	bo->ttm.priority = XE_BO_PRIORITY_NORMAL;
2351 	INIT_LIST_HEAD(&bo->pinned_link);
2352 #ifdef CONFIG_PROC_FS
2353 	INIT_LIST_HEAD(&bo->client_link);
2354 #endif
2355 	INIT_LIST_HEAD(&bo->vram_userfault_link);
2356 
2357 	/* Initialize purge advisory state */
2358 	bo->madv_purgeable = XE_MADV_PURGEABLE_WILLNEED;
2359 
2360 	drm_gem_private_object_init(&xe->drm, &bo->ttm.base, size);
2361 
2362 	if (resv) {
2363 		ctx.allow_res_evict = !(flags & XE_BO_FLAG_NO_RESV_EVICT);
2364 		ctx.resv = resv;
2365 	}
2366 
2367 	xe_validation_assert_exec(xe, exec, &bo->ttm.base);
2368 	if (!(flags & XE_BO_FLAG_FIXED_PLACEMENT)) {
2369 		err = __xe_bo_placement_for_flags(xe, bo, bo->flags, type);
2370 		if (WARN_ON(err)) {
2371 			xe_ttm_bo_destroy(&bo->ttm);
2372 			return ERR_PTR(err);
2373 		}
2374 	}
2375 
2376 	/* Defer populating type_sg bos */
2377 	placement = (type == ttm_bo_type_sg ||
2378 		     bo->flags & XE_BO_FLAG_DEFER_BACKING) ? &sys_placement :
2379 		&bo->placement;
2380 	err = ttm_bo_init_reserved(&xe->ttm, &bo->ttm, type,
2381 				   placement, alignment,
2382 				   &ctx, NULL, resv, xe_ttm_bo_destroy);
2383 	if (err)
2384 		return ERR_PTR(err);
2385 
2386 	/*
2387 	 * The VRAM pages underneath are potentially still being accessed by the
2388 	 * GPU, as per async GPU clearing and async evictions. However TTM makes
2389 	 * sure to add any corresponding move/clear fences into the objects
2390 	 * dma-resv using the DMA_RESV_USAGE_KERNEL slot.
2391 	 *
2392 	 * For KMD internal buffers we don't care about GPU clearing, however we
2393 	 * still need to handle async evictions, where the VRAM is still being
2394 	 * accessed by the GPU. Most internal callers are not expecting this,
2395 	 * since they are missing the required synchronisation before accessing
2396 	 * the memory. To keep things simple just sync wait any kernel fences
2397 	 * here, if the buffer is designated KMD internal.
2398 	 *
2399 	 * For normal userspace objects we should already have the required
2400 	 * pipelining or sync waiting elsewhere, since we already have to deal
2401 	 * with things like async GPU clearing.
2402 	 */
2403 	if (type == ttm_bo_type_kernel) {
2404 		long timeout = dma_resv_wait_timeout(bo->ttm.base.resv,
2405 						     DMA_RESV_USAGE_KERNEL,
2406 						     ctx.interruptible,
2407 						     MAX_SCHEDULE_TIMEOUT);
2408 
2409 		if (timeout < 0) {
2410 			if (!resv)
2411 				dma_resv_unlock(bo->ttm.base.resv);
2412 			xe_bo_put(bo);
2413 			return ERR_PTR(timeout);
2414 		}
2415 	}
2416 
2417 	bo->created = true;
2418 	if (bulk)
2419 		ttm_bo_set_bulk_move(&bo->ttm, bulk);
2420 	else
2421 		ttm_bo_move_to_lru_tail_unlocked(&bo->ttm);
2422 
2423 	return bo;
2424 }
2425 
2426 static int __xe_bo_fixed_placement(struct xe_device *xe,
2427 				   struct xe_bo *bo, enum ttm_bo_type type,
2428 				   u32 flags,
2429 				   u64 start, u64 end, u64 size)
2430 {
2431 	struct ttm_place *place = bo->placements;
2432 	u32 vram_flag, vram_stolen_flags;
2433 
2434 	/*
2435 	 * to allow fixed placement in GGTT of a VF, post-migration fixups would have to
2436 	 * include selecting a new fixed offset and shifting the page ranges for it
2437 	 */
2438 	xe_assert(xe, !IS_SRIOV_VF(xe) || !(bo->flags & XE_BO_FLAG_GGTT));
2439 
2440 	if (flags & (XE_BO_FLAG_USER | XE_BO_FLAG_SYSTEM))
2441 		return -EINVAL;
2442 
2443 	vram_flag = flags & XE_BO_FLAG_VRAM_MASK;
2444 	vram_stolen_flags = (flags & (XE_BO_FLAG_STOLEN)) | vram_flag;
2445 
2446 	/* check if more than one VRAM/STOLEN flag is set */
2447 	if (hweight32(vram_stolen_flags) > 1)
2448 		return -EINVAL;
2449 
2450 	place->flags = TTM_PL_FLAG_CONTIGUOUS;
2451 	place->fpfn = start >> PAGE_SHIFT;
2452 	place->lpfn = end >> PAGE_SHIFT;
2453 
2454 	if (flags & XE_BO_FLAG_STOLEN)
2455 		place->mem_type = XE_PL_STOLEN;
2456 	else
2457 		place->mem_type = bo_vram_flags_to_vram_placement(xe, flags, vram_flag, type);
2458 
2459 	bo->placement = (struct ttm_placement) {
2460 		.num_placement = 1,
2461 		.placement = place,
2462 	};
2463 
2464 	return 0;
2465 }
2466 
2467 static struct xe_bo *
2468 __xe_bo_create_locked(struct xe_device *xe,
2469 		      struct xe_tile *tile, struct xe_vm *vm,
2470 		      size_t size, u64 start, u64 end,
2471 		      u16 cpu_caching, enum ttm_bo_type type, u32 flags,
2472 		      u64 alignment, struct drm_exec *exec)
2473 {
2474 	struct xe_bo *bo = NULL;
2475 	int err;
2476 
2477 	if (vm)
2478 		xe_vm_assert_held(vm);
2479 
2480 	if (start || end != ~0ULL) {
2481 		bo = xe_bo_alloc();
2482 		if (IS_ERR(bo))
2483 			return bo;
2484 
2485 		flags |= XE_BO_FLAG_FIXED_PLACEMENT;
2486 		err = __xe_bo_fixed_placement(xe, bo, type, flags, start, end, size);
2487 		if (err) {
2488 			xe_bo_free(bo);
2489 			return ERR_PTR(err);
2490 		}
2491 	}
2492 
2493 	bo = xe_bo_init_locked(xe, bo, tile, vm ? xe_vm_resv(vm) : NULL,
2494 			       vm && !xe_vm_in_fault_mode(vm) &&
2495 			       flags & XE_BO_FLAG_USER ?
2496 			       &vm->lru_bulk_move : NULL, size,
2497 			       cpu_caching, type, flags, exec);
2498 	if (IS_ERR(bo))
2499 		return bo;
2500 
2501 	bo->min_align = alignment;
2502 
2503 	/*
2504 	 * Note that instead of taking a reference no the drm_gpuvm_resv_bo(),
2505 	 * to ensure the shared resv doesn't disappear under the bo, the bo
2506 	 * will keep a reference to the vm, and avoid circular references
2507 	 * by having all the vm's bo refereferences released at vm close
2508 	 * time.
2509 	 */
2510 	if (vm && xe_bo_is_user(bo))
2511 		xe_vm_get(vm);
2512 	bo->vm = vm;
2513 
2514 	if (bo->flags & XE_BO_FLAG_GGTT) {
2515 		struct xe_tile *t;
2516 		u8 id;
2517 
2518 		if (!(bo->flags & XE_BO_FLAG_GGTT_ALL)) {
2519 			if (!tile && flags & XE_BO_FLAG_STOLEN)
2520 				tile = xe_device_get_root_tile(xe);
2521 
2522 			xe_assert(xe, tile);
2523 		}
2524 
2525 		for_each_tile(t, xe, id) {
2526 			if (t != tile && !(bo->flags & XE_BO_FLAG_GGTTx(t)))
2527 				continue;
2528 
2529 			if (flags & XE_BO_FLAG_FIXED_PLACEMENT) {
2530 				err = xe_ggtt_insert_bo_at(t->mem.ggtt, bo,
2531 							   start + xe_bo_size(bo), U64_MAX,
2532 							   exec);
2533 			} else {
2534 				err = xe_ggtt_insert_bo(t->mem.ggtt, bo, exec);
2535 			}
2536 			if (err)
2537 				goto err_unlock_put_bo;
2538 		}
2539 	}
2540 
2541 	trace_xe_bo_create(bo);
2542 	return bo;
2543 
2544 err_unlock_put_bo:
2545 	__xe_bo_unset_bulk_move(bo);
2546 	xe_bo_unlock_vm_held(bo);
2547 	xe_bo_put(bo);
2548 	return ERR_PTR(err);
2549 }
2550 
2551 /**
2552  * xe_bo_create_locked() - Create a BO
2553  * @xe: The xe device.
2554  * @tile: The tile to select for migration of this bo, and the tile used for
2555  * GGTT binding if any. Only to be non-NULL for ttm_bo_type_kernel bos.
2556  * @vm: The local vm or NULL for external objects.
2557  * @size: The storage size to use for the bo.
2558  * @type: The TTM buffer object type.
2559  * @flags: XE_BO_FLAG_ flags.
2560  * @exec: The drm_exec transaction to use for exhaustive eviction.
2561  *
2562  * Create a locked xe BO with no range- nor alignment restrictions.
2563  *
2564  * Return: The buffer object on success. Negative error pointer on failure.
2565  */
2566 struct xe_bo *xe_bo_create_locked(struct xe_device *xe, struct xe_tile *tile,
2567 				  struct xe_vm *vm, size_t size,
2568 				  enum ttm_bo_type type, u32 flags,
2569 				  struct drm_exec *exec)
2570 {
2571 	return __xe_bo_create_locked(xe, tile, vm, size, 0, ~0ULL, 0, type,
2572 				     flags, 0, exec);
2573 }
2574 
2575 static struct xe_bo *xe_bo_create_novm(struct xe_device *xe, struct xe_tile *tile,
2576 				       size_t size, u16 cpu_caching,
2577 				       enum ttm_bo_type type, u32 flags,
2578 				       u64 alignment, bool intr)
2579 {
2580 	struct xe_validation_ctx ctx;
2581 	struct drm_exec exec;
2582 	struct xe_bo *bo;
2583 	int ret = 0;
2584 
2585 	xe_validation_guard(&ctx, &xe->val, &exec, (struct xe_val_flags) {.interruptible = intr},
2586 			    ret) {
2587 		bo = __xe_bo_create_locked(xe, tile, NULL, size, 0, ~0ULL,
2588 					   cpu_caching, type, flags, alignment, &exec);
2589 		drm_exec_retry_on_contention(&exec);
2590 		if (IS_ERR(bo)) {
2591 			ret = PTR_ERR(bo);
2592 			xe_validation_retry_on_oom(&ctx, &ret);
2593 		} else {
2594 			xe_bo_unlock(bo);
2595 		}
2596 	}
2597 
2598 	return ret ? ERR_PTR(ret) : bo;
2599 }
2600 
2601 /**
2602  * xe_bo_create_user() - Create a user BO
2603  * @xe: The xe device.
2604  * @vm: The local vm or NULL for external objects.
2605  * @size: The storage size to use for the bo.
2606  * @cpu_caching: The caching mode to be used for system backing store.
2607  * @flags: XE_BO_FLAG_ flags.
2608  * @exec: The drm_exec transaction to use for exhaustive eviction, or NULL
2609  * if such a transaction should be initiated by the call.
2610  *
2611  * Create a bo on behalf of user-space.
2612  *
2613  * Return: The buffer object on success. Negative error pointer on failure.
2614  */
2615 struct xe_bo *xe_bo_create_user(struct xe_device *xe,
2616 				struct xe_vm *vm, size_t size,
2617 				u16 cpu_caching,
2618 				u32 flags, struct drm_exec *exec)
2619 {
2620 	struct xe_bo *bo;
2621 
2622 	flags |= XE_BO_FLAG_USER;
2623 
2624 	if (vm || exec) {
2625 		xe_assert(xe, exec);
2626 		bo = __xe_bo_create_locked(xe, NULL, vm, size, 0, ~0ULL,
2627 					   cpu_caching, ttm_bo_type_device,
2628 					   flags, 0, exec);
2629 		if (!IS_ERR(bo))
2630 			xe_bo_unlock_vm_held(bo);
2631 	} else {
2632 		bo = xe_bo_create_novm(xe, NULL, size, cpu_caching,
2633 				       ttm_bo_type_device, flags, 0, true);
2634 	}
2635 
2636 	return bo;
2637 }
2638 
2639 /**
2640  * xe_bo_create_pin_range_novm() - Create and pin a BO with range options.
2641  * @xe: The xe device.
2642  * @tile: The tile to select for migration of this bo, and the tile used for
2643  * GGTT binding if any. Only to be non-NULL for ttm_bo_type_kernel bos.
2644  * @size: The storage size to use for the bo.
2645  * @start: Start of fixed VRAM range or 0.
2646  * @end: End of fixed VRAM range or ~0ULL.
2647  * @type: The TTM buffer object type.
2648  * @flags: XE_BO_FLAG_ flags.
2649  *
2650  * Create an Xe BO with range- and options. If @start and @end indicate
2651  * a fixed VRAM range, this must be a ttm_bo_type_kernel bo with VRAM placement
2652  * only.
2653  *
2654  * Return: The buffer object on success. Negative error pointer on failure.
2655  */
2656 struct xe_bo *xe_bo_create_pin_range_novm(struct xe_device *xe, struct xe_tile *tile,
2657 					  size_t size, u64 start, u64 end,
2658 					  enum ttm_bo_type type, u32 flags)
2659 {
2660 	struct xe_validation_ctx ctx;
2661 	struct drm_exec exec;
2662 	struct xe_bo *bo;
2663 	int err = 0;
2664 
2665 	xe_validation_guard(&ctx, &xe->val, &exec, (struct xe_val_flags) {}, err) {
2666 		bo = __xe_bo_create_locked(xe, tile, NULL, size, start, end,
2667 					   0, type, flags, 0, &exec);
2668 		if (IS_ERR(bo)) {
2669 			drm_exec_retry_on_contention(&exec);
2670 			err = PTR_ERR(bo);
2671 			xe_validation_retry_on_oom(&ctx, &err);
2672 			break;
2673 		}
2674 
2675 		err = xe_bo_pin(bo, &exec);
2676 		xe_bo_unlock(bo);
2677 		if (err) {
2678 			xe_bo_put(bo);
2679 			drm_exec_retry_on_contention(&exec);
2680 			xe_validation_retry_on_oom(&ctx, &err);
2681 			break;
2682 		}
2683 	}
2684 
2685 	return err ? ERR_PTR(err) : bo;
2686 }
2687 
2688 static struct xe_bo *xe_bo_create_pin_map_at_aligned(struct xe_device *xe,
2689 						     struct xe_tile *tile,
2690 						     struct xe_vm *vm,
2691 						     size_t size, u64 offset,
2692 						     enum ttm_bo_type type, u32 flags,
2693 						     u64 alignment, struct drm_exec *exec)
2694 {
2695 	struct xe_bo *bo;
2696 	int err;
2697 	u64 start = offset == ~0ull ? 0 : offset;
2698 	u64 end = offset == ~0ull ? ~0ull : start + size;
2699 
2700 	if (flags & XE_BO_FLAG_STOLEN &&
2701 	    xe_ttm_stolen_cpu_access_needs_ggtt(xe))
2702 		flags |= XE_BO_FLAG_GGTT;
2703 
2704 	bo = __xe_bo_create_locked(xe, tile, vm, size, start, end, 0, type,
2705 				   flags | XE_BO_FLAG_NEEDS_CPU_ACCESS | XE_BO_FLAG_PINNED,
2706 				   alignment, exec);
2707 	if (IS_ERR(bo))
2708 		return bo;
2709 
2710 	err = xe_bo_pin(bo, exec);
2711 	if (err)
2712 		goto err_put;
2713 
2714 	err = xe_bo_vmap(bo);
2715 	if (err)
2716 		goto err_unpin;
2717 
2718 	xe_bo_unlock_vm_held(bo);
2719 
2720 	return bo;
2721 
2722 err_unpin:
2723 	xe_bo_unpin(bo);
2724 err_put:
2725 	xe_bo_unlock_vm_held(bo);
2726 	xe_bo_put(bo);
2727 	return ERR_PTR(err);
2728 }
2729 
2730 /**
2731  * xe_bo_create_pin_map_at_novm() - Create pinned and mapped bo at optional VRAM offset
2732  * @xe: The xe device.
2733  * @tile: The tile to select for migration of this bo, and the tile used for
2734  * GGTT binding if any. Only to be non-NULL for ttm_bo_type_kernel bos.
2735  * @size: The storage size to use for the bo.
2736  * @offset: Optional VRAM offset or %~0ull for don't care.
2737  * @type: The TTM buffer object type.
2738  * @flags: XE_BO_FLAG_ flags.
2739  * @alignment: GGTT alignment.
2740  * @intr: Whether to execute any waits for backing store interruptible.
2741  *
2742  * Create a pinned and optionally mapped bo with VRAM offset and GGTT alignment
2743  * options. The bo will be external and not associated with a VM.
2744  *
2745  * Return: The buffer object on success. Negative error pointer on failure.
2746  * In particular, the function may return ERR_PTR(%-EINTR) if @intr was set
2747  * to true on entry.
2748  */
2749 struct xe_bo *
2750 xe_bo_create_pin_map_at_novm(struct xe_device *xe, struct xe_tile *tile,
2751 			     size_t size, u64 offset, enum ttm_bo_type type, u32 flags,
2752 			     u64 alignment, bool intr)
2753 {
2754 	struct xe_validation_ctx ctx;
2755 	struct drm_exec exec;
2756 	struct xe_bo *bo;
2757 	int ret = 0;
2758 
2759 	xe_validation_guard(&ctx, &xe->val, &exec, (struct xe_val_flags) {.interruptible = intr},
2760 			    ret) {
2761 		bo = xe_bo_create_pin_map_at_aligned(xe, tile, NULL, size, offset,
2762 						     type, flags, alignment, &exec);
2763 		if (IS_ERR(bo)) {
2764 			drm_exec_retry_on_contention(&exec);
2765 			ret = PTR_ERR(bo);
2766 			xe_validation_retry_on_oom(&ctx, &ret);
2767 		}
2768 	}
2769 
2770 	return ret ? ERR_PTR(ret) : bo;
2771 }
2772 
2773 /**
2774  * xe_bo_create_pin_map() - Create pinned and mapped bo
2775  * @xe: The xe device.
2776  * @tile: The tile to select for migration of this bo, and the tile used for
2777  * @vm: The vm to associate the buffer object with. The vm's resv must be locked
2778  * with the transaction represented by @exec.
2779  * GGTT binding if any. Only to be non-NULL for ttm_bo_type_kernel bos.
2780  * @size: The storage size to use for the bo.
2781  * @type: The TTM buffer object type.
2782  * @flags: XE_BO_FLAG_ flags.
2783  * @exec: The drm_exec transaction to use for exhaustive eviction, and
2784  * previously used for locking @vm's resv.
2785  *
2786  * Create a pinned and mapped bo. The bo will be external and not associated
2787  * with a VM.
2788  *
2789  * Return: The buffer object on success. Negative error pointer on failure.
2790  * In particular, the function may return ERR_PTR(%-EINTR) if @exec was
2791  * configured for interruptible locking.
2792  */
2793 struct xe_bo *xe_bo_create_pin_map(struct xe_device *xe, struct xe_tile *tile,
2794 				   struct xe_vm *vm, size_t size,
2795 				   enum ttm_bo_type type, u32 flags,
2796 				   struct drm_exec *exec)
2797 {
2798 	return xe_bo_create_pin_map_at_aligned(xe, tile, vm, size, ~0ull, type, flags,
2799 					       0, exec);
2800 }
2801 
2802 /**
2803  * xe_bo_create_pin_map_novm() - Create pinned and mapped bo
2804  * @xe: The xe device.
2805  * @tile: The tile to select for migration of this bo, and the tile used for
2806  * GGTT binding if any. Only to be non-NULL for ttm_bo_type_kernel bos.
2807  * @size: The storage size to use for the bo.
2808  * @type: The TTM buffer object type.
2809  * @flags: XE_BO_FLAG_ flags.
2810  * @intr: Whether to execute any waits for backing store interruptible.
2811  *
2812  * Create a pinned and mapped bo. The bo will be external and not associated
2813  * with a VM.
2814  *
2815  * Return: The buffer object on success. Negative error pointer on failure.
2816  * In particular, the function may return ERR_PTR(%-EINTR) if @intr was set
2817  * to true on entry.
2818  */
2819 struct xe_bo *xe_bo_create_pin_map_novm(struct xe_device *xe, struct xe_tile *tile,
2820 					size_t size, enum ttm_bo_type type, u32 flags,
2821 					bool intr)
2822 {
2823 	return xe_bo_create_pin_map_at_novm(xe, tile, size, ~0ull, type, flags, 0, intr);
2824 }
2825 
/* devm action adapter: @arg is the bo registered by the managed helpers. */
static void __xe_bo_unpin_map_no_vm(void *arg)
{
	struct xe_bo *bo = arg;

	xe_bo_unpin_map_no_vm(bo);
}
2830 
2831 struct xe_bo *xe_managed_bo_create_pin_map(struct xe_device *xe, struct xe_tile *tile,
2832 					   size_t size, u32 flags)
2833 {
2834 	struct xe_bo *bo;
2835 	int ret;
2836 
2837 	KUNIT_STATIC_STUB_REDIRECT(xe_managed_bo_create_pin_map, xe, tile, size, flags);
2838 	bo = xe_bo_create_pin_map_novm(xe, tile, size, ttm_bo_type_kernel, flags, true);
2839 	if (IS_ERR(bo))
2840 		return bo;
2841 
2842 	ret = devm_add_action_or_reset(xe->drm.dev, __xe_bo_unpin_map_no_vm, bo);
2843 	if (ret)
2844 		return ERR_PTR(ret);
2845 
2846 	return bo;
2847 }
2848 
2849 void xe_managed_bo_unpin_map_no_vm(struct xe_bo *bo)
2850 {
2851 	devm_release_action(xe_bo_device(bo)->drm.dev, __xe_bo_unpin_map_no_vm, bo);
2852 }
2853 
2854 struct xe_bo *xe_managed_bo_create_from_data(struct xe_device *xe, struct xe_tile *tile,
2855 					     const void *data, size_t size, u32 flags)
2856 {
2857 	struct xe_bo *bo = xe_managed_bo_create_pin_map(xe, tile, ALIGN(size, PAGE_SIZE), flags);
2858 
2859 	if (IS_ERR(bo))
2860 		return bo;
2861 
2862 	xe_map_memcpy_to(xe, &bo->vmap, 0, data, size);
2863 
2864 	return bo;
2865 }
2866 
2867 /**
2868  * xe_managed_bo_reinit_in_vram
2869  * @xe: xe device
2870  * @tile: Tile where the new buffer will be created
2871  * @src: Managed buffer object allocated in system memory
2872  *
2873  * Replace a managed src buffer object allocated in system memory with a new
2874  * one allocated in vram, copying the data between them.
2875  * Buffer object in VRAM is not going to have the same GGTT address, the caller
2876  * is responsible for making sure that any old references to it are updated.
2877  *
2878  * Returns 0 for success, negative error code otherwise.
2879  */
2880 int xe_managed_bo_reinit_in_vram(struct xe_device *xe, struct xe_tile *tile, struct xe_bo **src)
2881 {
2882 	struct xe_bo *bo;
2883 	u32 dst_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT;
2884 
2885 	dst_flags |= (*src)->flags & (XE_BO_FLAG_GGTT_INVALIDATE |
2886 				      XE_BO_FLAG_PINNED_NORESTORE);
2887 
2888 	xe_assert(xe, IS_DGFX(xe));
2889 	xe_assert(xe, !(*src)->vmap.is_iomem);
2890 
2891 	bo = xe_managed_bo_create_from_data(xe, tile, (*src)->vmap.vaddr,
2892 					    xe_bo_size(*src), dst_flags);
2893 	if (IS_ERR(bo))
2894 		return PTR_ERR(bo);
2895 
2896 	devm_release_action(xe->drm.dev, __xe_bo_unpin_map_no_vm, *src);
2897 	*src = bo;
2898 
2899 	return 0;
2900 }
2901 
2902 /*
2903  * XXX: This is in the VM bind data path, likely should calculate this once and
2904  * store, with a recalculation if the BO is moved.
2905  */
2906 uint64_t vram_region_gpu_offset(struct ttm_resource *res)
2907 {
2908 	struct xe_device *xe = ttm_to_xe_device(res->bo->bdev);
2909 
2910 	switch (res->mem_type) {
2911 	case XE_PL_STOLEN:
2912 		return xe_ttm_stolen_gpu_offset(xe);
2913 	case XE_PL_TT:
2914 	case XE_PL_SYSTEM:
2915 		return 0;
2916 	default:
2917 		return xe_map_resource_to_region(res)->dpa_base;
2918 	}
2919 	return 0;
2920 }
2921 
2922 /**
2923  * xe_bo_pin_external - pin an external BO
2924  * @bo: buffer object to be pinned
2925  * @in_place: Pin in current placement, don't attempt to migrate.
2926  * @exec: The drm_exec transaction to use for exhaustive eviction.
2927  *
2928  * Pin an external (not tied to a VM, can be exported via dma-buf / prime FD)
2929  * BO. Unique call compared to xe_bo_pin as this function has it own set of
2930  * asserts and code to ensure evict / restore on suspend / resume.
2931  *
2932  * Returns 0 for success, negative error code otherwise.
2933  */
2934 int xe_bo_pin_external(struct xe_bo *bo, bool in_place, struct drm_exec *exec)
2935 {
2936 	struct xe_device *xe = xe_bo_device(bo);
2937 	int err;
2938 
2939 	xe_assert(xe, !bo->vm);
2940 	xe_assert(xe, xe_bo_is_user(bo));
2941 
2942 	if (!xe_bo_is_pinned(bo)) {
2943 		if (!in_place) {
2944 			err = xe_bo_validate(bo, NULL, false, exec);
2945 			if (err)
2946 				return err;
2947 		}
2948 
2949 		spin_lock(&xe->pinned.lock);
2950 		list_add_tail(&bo->pinned_link, &xe->pinned.late.external);
2951 		spin_unlock(&xe->pinned.lock);
2952 	}
2953 
2954 	ttm_bo_pin(&bo->ttm);
2955 	if (bo->ttm.ttm && ttm_tt_is_populated(bo->ttm.ttm))
2956 		xe_ttm_tt_account_subtract(xe, bo->ttm.ttm);
2957 
2958 	/*
2959 	 * FIXME: If we always use the reserve / unreserve functions for locking
2960 	 * we do not need this.
2961 	 */
2962 	ttm_bo_move_to_lru_tail_unlocked(&bo->ttm);
2963 
2964 	return 0;
2965 }
2966 
2967 /**
2968  * xe_bo_pin() - Pin a kernel bo after potentially migrating it
2969  * @bo: The kernel bo to pin.
2970  * @exec: The drm_exec transaction to use for exhaustive eviction.
2971  *
2972  * Attempts to migrate a bo to @bo->placement. If that succeeds,
2973  * pins the bo.
2974  *
2975  * Return: %0 on success, negative error code on migration failure.
2976  */
2977 int xe_bo_pin(struct xe_bo *bo, struct drm_exec *exec)
2978 {
2979 	struct ttm_place *place = &bo->placements[0];
2980 	struct xe_device *xe = xe_bo_device(bo);
2981 	int err;
2982 
2983 	/* We currently don't expect user BO to be pinned */
2984 	xe_assert(xe, !xe_bo_is_user(bo));
2985 
2986 	/* Pinned object must be in GGTT or have pinned flag */
2987 	xe_assert(xe, bo->flags & (XE_BO_FLAG_PINNED |
2988 				   XE_BO_FLAG_GGTT));
2989 
2990 	/*
2991 	 * No reason we can't support pinning imported dma-bufs we just don't
2992 	 * expect to pin an imported dma-buf.
2993 	 */
2994 	xe_assert(xe, !bo->ttm.base.import_attach);
2995 
2996 	/* We only expect at most 1 pin */
2997 	xe_assert(xe, !xe_bo_is_pinned(bo));
2998 
2999 	err = xe_bo_validate(bo, NULL, false, exec);
3000 	if (err)
3001 		return err;
3002 
3003 	if (mem_type_is_vram(place->mem_type) || bo->flags & XE_BO_FLAG_GGTT) {
3004 		spin_lock(&xe->pinned.lock);
3005 		if (bo->flags & XE_BO_FLAG_PINNED_LATE_RESTORE)
3006 			list_add_tail(&bo->pinned_link, &xe->pinned.late.kernel_bo_present);
3007 		else
3008 			list_add_tail(&bo->pinned_link, &xe->pinned.early.kernel_bo_present);
3009 		spin_unlock(&xe->pinned.lock);
3010 	}
3011 
3012 	ttm_bo_pin(&bo->ttm);
3013 	if (bo->ttm.ttm && ttm_tt_is_populated(bo->ttm.ttm))
3014 		xe_ttm_tt_account_subtract(xe, bo->ttm.ttm);
3015 
3016 	/*
3017 	 * FIXME: If we always use the reserve / unreserve functions for locking
3018 	 * we do not need this.
3019 	 */
3020 	ttm_bo_move_to_lru_tail_unlocked(&bo->ttm);
3021 
3022 	return 0;
3023 }
3024 
3025 /**
3026  * xe_bo_unpin_external - unpin an external BO
3027  * @bo: buffer object to be unpinned
3028  *
3029  * Unpin an external (not tied to a VM, can be exported via dma-buf / prime FD)
3030  * BO. Unique call compared to xe_bo_unpin as this function has it own set of
3031  * asserts and code to ensure evict / restore on suspend / resume.
3032  *
3033  * Returns 0 for success, negative error code otherwise.
3034  */
3035 void xe_bo_unpin_external(struct xe_bo *bo)
3036 {
3037 	struct xe_device *xe = xe_bo_device(bo);
3038 
3039 	xe_assert(xe, !bo->vm);
3040 	xe_assert(xe, xe_bo_is_pinned(bo));
3041 	xe_assert(xe, xe_bo_is_user(bo));
3042 
3043 	spin_lock(&xe->pinned.lock);
3044 	if (bo->ttm.pin_count == 1 && !list_empty(&bo->pinned_link))
3045 		list_del_init(&bo->pinned_link);
3046 	spin_unlock(&xe->pinned.lock);
3047 
3048 	ttm_bo_unpin(&bo->ttm);
3049 	if (bo->ttm.ttm && ttm_tt_is_populated(bo->ttm.ttm))
3050 		xe_ttm_tt_account_add(xe, bo->ttm.ttm);
3051 
3052 	/*
3053 	 * FIXME: If we always use the reserve / unreserve functions for locking
3054 	 * we do not need this.
3055 	 */
3056 	ttm_bo_move_to_lru_tail_unlocked(&bo->ttm);
3057 }
3058 
3059 void xe_bo_unpin(struct xe_bo *bo)
3060 {
3061 	struct ttm_place *place = &bo->placements[0];
3062 	struct xe_device *xe = xe_bo_device(bo);
3063 
3064 	xe_assert(xe, !bo->ttm.base.import_attach);
3065 	xe_assert(xe, xe_bo_is_pinned(bo));
3066 
3067 	if (mem_type_is_vram(place->mem_type) || bo->flags & XE_BO_FLAG_GGTT) {
3068 		spin_lock(&xe->pinned.lock);
3069 		xe_assert(xe, !list_empty(&bo->pinned_link));
3070 		list_del_init(&bo->pinned_link);
3071 		spin_unlock(&xe->pinned.lock);
3072 
3073 		if (bo->backup_obj) {
3074 			if (xe_bo_is_pinned(bo->backup_obj))
3075 				ttm_bo_unpin(&bo->backup_obj->ttm);
3076 			xe_bo_put(bo->backup_obj);
3077 			bo->backup_obj = NULL;
3078 		}
3079 	}
3080 	ttm_bo_unpin(&bo->ttm);
3081 	if (bo->ttm.ttm && ttm_tt_is_populated(bo->ttm.ttm))
3082 		xe_ttm_tt_account_add(xe, bo->ttm.ttm);
3083 }
3084 
3085 /**
3086  * xe_bo_validate() - Make sure the bo is in an allowed placement
3087  * @bo: The bo,
3088  * @vm: Pointer to a the vm the bo shares a locked dma_resv object with, or
3089  *      NULL. Used together with @allow_res_evict.
3090  * @allow_res_evict: Whether it's allowed to evict bos sharing @vm's
3091  *                   reservation object.
3092  * @exec: The drm_exec transaction to use for exhaustive eviction.
3093  *
3094  * Make sure the bo is in allowed placement, migrating it if necessary. If
3095  * needed, other bos will be evicted. If bos selected for eviction shares
3096  * the @vm's reservation object, they can be evicted iff @allow_res_evict is
3097  * set to true, otherwise they will be bypassed.
3098  *
3099  * Return: 0 on success, negative error code on failure. May return
3100  * -EINTR or -ERESTARTSYS if internal waits are interrupted by a signal.
3101  */
3102 int xe_bo_validate(struct xe_bo *bo, struct xe_vm *vm, bool allow_res_evict,
3103 		   struct drm_exec *exec)
3104 {
3105 	struct ttm_operation_ctx ctx = {
3106 		.interruptible = true,
3107 		.no_wait_gpu = false,
3108 		.gfp_retry_mayfail = true,
3109 	};
3110 	int ret;
3111 
3112 	if (xe_bo_is_pinned(bo))
3113 		return 0;
3114 
3115 	if (vm) {
3116 		lockdep_assert_held(&vm->lock);
3117 		xe_vm_assert_held(vm);
3118 
3119 		ctx.allow_res_evict = allow_res_evict;
3120 		ctx.resv = xe_vm_resv(vm);
3121 	}
3122 
3123 	xe_vm_set_validating(vm, allow_res_evict);
3124 	trace_xe_bo_validate(bo);
3125 	xe_validation_assert_exec(xe_bo_device(bo), exec, &bo->ttm.base);
3126 	ret = ttm_bo_validate(&bo->ttm, &bo->placement, &ctx);
3127 	xe_vm_clear_validating(vm, allow_res_evict);
3128 
3129 	return ret;
3130 }
3131 
3132 bool xe_bo_is_xe_bo(struct ttm_buffer_object *bo)
3133 {
3134 	if (bo->destroy == &xe_ttm_bo_destroy)
3135 		return true;
3136 
3137 	return false;
3138 }
3139 
3140 /*
3141  * Resolve a BO address. There is no assert to check if the proper lock is held
3142  * so it should only be used in cases where it is not fatal to get the wrong
3143  * address, such as printing debug information, but not in cases where memory is
3144  * written based on this result.
3145  */
3146 dma_addr_t __xe_bo_addr(struct xe_bo *bo, u64 offset, size_t page_size)
3147 {
3148 	struct xe_device *xe = xe_bo_device(bo);
3149 	struct xe_res_cursor cur;
3150 	u64 page;
3151 
3152 	xe_assert(xe, page_size <= PAGE_SIZE);
3153 	page = offset >> PAGE_SHIFT;
3154 	offset &= (PAGE_SIZE - 1);
3155 
3156 	if (!xe_bo_is_vram(bo) && !xe_bo_is_stolen(bo)) {
3157 		xe_assert(xe, bo->ttm.ttm);
3158 
3159 		xe_res_first_sg(xe_bo_sg(bo), page << PAGE_SHIFT,
3160 				page_size, &cur);
3161 		return xe_res_dma(&cur) + offset;
3162 	} else {
3163 		struct xe_res_cursor cur;
3164 
3165 		xe_res_first(bo->ttm.resource, page << PAGE_SHIFT,
3166 			     page_size, &cur);
3167 		return cur.start + offset + vram_region_gpu_offset(bo->ttm.resource);
3168 	}
3169 }
3170 
3171 dma_addr_t xe_bo_addr(struct xe_bo *bo, u64 offset, size_t page_size)
3172 {
3173 	if (!READ_ONCE(bo->ttm.pin_count))
3174 		xe_bo_assert_held(bo);
3175 	return __xe_bo_addr(bo, offset, page_size);
3176 }
3177 
3178 int xe_bo_vmap(struct xe_bo *bo)
3179 {
3180 	struct xe_device *xe = ttm_to_xe_device(bo->ttm.bdev);
3181 	void *virtual;
3182 	bool is_iomem;
3183 	int ret;
3184 
3185 	xe_bo_assert_held(bo);
3186 
3187 	if (drm_WARN_ON(&xe->drm, !(bo->flags & XE_BO_FLAG_NEEDS_CPU_ACCESS) ||
3188 			!force_contiguous(bo->flags)))
3189 		return -EINVAL;
3190 
3191 	if (!iosys_map_is_null(&bo->vmap))
3192 		return 0;
3193 
3194 	/*
3195 	 * We use this more or less deprecated interface for now since
3196 	 * ttm_bo_vmap() doesn't offer the optimization of kmapping
3197 	 * single page bos, which is done here.
3198 	 * TODO: Fix up ttm_bo_vmap to do that, or fix up ttm_bo_kmap
3199 	 * to use struct iosys_map.
3200 	 */
3201 	ret = ttm_bo_kmap(&bo->ttm, 0, xe_bo_size(bo) >> PAGE_SHIFT, &bo->kmap);
3202 	if (ret)
3203 		return ret;
3204 
3205 	virtual = ttm_kmap_obj_virtual(&bo->kmap, &is_iomem);
3206 	if (is_iomem)
3207 		iosys_map_set_vaddr_iomem(&bo->vmap, (void __iomem *)virtual);
3208 	else
3209 		iosys_map_set_vaddr(&bo->vmap, virtual);
3210 
3211 	return 0;
3212 }
3213 
3214 static void __xe_bo_vunmap(struct xe_bo *bo)
3215 {
3216 	if (!iosys_map_is_null(&bo->vmap)) {
3217 		iosys_map_clear(&bo->vmap);
3218 		ttm_bo_kunmap(&bo->kmap);
3219 	}
3220 }
3221 
/**
 * xe_bo_vunmap() - Remove the bo's kernel mapping, if any
 * @bo: The buffer object. Its lock must be held.
 *
 * Locked wrapper around __xe_bo_vunmap(); a bo without a mapping is left
 * untouched.
 */
void xe_bo_vunmap(struct xe_bo *bo)
{
	xe_bo_assert_held(bo);

	__xe_bo_vunmap(bo);
}
3227 
/*
 * Property handler for DRM_XE_GEM_CREATE_SET_PROPERTY_PXP_TYPE.
 *
 * @value is the PXP type requested by userspace for @bo. Returns 0 when no
 * PXP is requested, -EINVAL for an unsupported type, and otherwise whatever
 * xe_pxp_key_assign() returns.
 */
static int gem_create_set_pxp_type(struct xe_device *xe, struct xe_bo *bo, u64 value)
{
	/* No PXP requested for this bo: nothing to do. */
	if (value == DRM_XE_PXP_TYPE_NONE)
		return 0;

	/* we only support DRM_XE_PXP_TYPE_HWDRM for now */
	if (XE_IOCTL_DBG(xe, value != DRM_XE_PXP_TYPE_HWDRM))
		return -EINVAL;

	return xe_pxp_key_assign(xe->pxp, bo);
}
3239 
/* Signature of a handler that applies one gem-create property @value to @bo. */
typedef int (*xe_gem_create_set_property_fn)(struct xe_device *xe,
					     struct xe_bo *bo,
					     u64 value);

/*
 * Property handlers, indexed by property id
 * (DRM_XE_GEM_CREATE_SET_PROPERTY_*). Looked up by
 * gem_create_user_ext_set_property().
 */
static const xe_gem_create_set_property_fn gem_create_set_property_funcs[] = {
	[DRM_XE_GEM_CREATE_SET_PROPERTY_PXP_TYPE] = gem_create_set_pxp_type,
};
3247 
3248 static int gem_create_user_ext_set_property(struct xe_device *xe,
3249 					    struct xe_bo *bo,
3250 					    u64 extension)
3251 {
3252 	u64 __user *address = u64_to_user_ptr(extension);
3253 	struct drm_xe_ext_set_property ext;
3254 	int err;
3255 	u32 idx;
3256 
3257 	err = copy_from_user(&ext, address, sizeof(ext));
3258 	if (XE_IOCTL_DBG(xe, err))
3259 		return -EFAULT;
3260 
3261 	if (XE_IOCTL_DBG(xe, ext.property >=
3262 			 ARRAY_SIZE(gem_create_set_property_funcs)) ||
3263 	    XE_IOCTL_DBG(xe, ext.pad) ||
3264 	    XE_IOCTL_DBG(xe, ext.property != DRM_XE_GEM_CREATE_EXTENSION_SET_PROPERTY))
3265 		return -EINVAL;
3266 
3267 	idx = array_index_nospec(ext.property, ARRAY_SIZE(gem_create_set_property_funcs));
3268 	if (!gem_create_set_property_funcs[idx])
3269 		return -EINVAL;
3270 
3271 	return gem_create_set_property_funcs[idx](xe, bo, ext.value);
3272 }
3273 
/*
 * Signature of a gem-create user extension handler. @extension is the user
 * address of the extension struct to process for @bo.
 */
typedef int (*xe_gem_create_user_extension_fn)(struct xe_device *xe,
					       struct xe_bo *bo,
					       u64 extension);

/*
 * Extension handlers, indexed by extension name
 * (DRM_XE_GEM_CREATE_EXTENSION_*). Looked up by gem_create_user_extensions().
 */
static const xe_gem_create_user_extension_fn gem_create_user_extension_funcs[] = {
	[DRM_XE_GEM_CREATE_EXTENSION_SET_PROPERTY] = gem_create_user_ext_set_property,
};
3281 
3282 #define MAX_USER_EXTENSIONS	16
3283 static int gem_create_user_extensions(struct xe_device *xe, struct xe_bo *bo,
3284 				      u64 extensions, int ext_number)
3285 {
3286 	u64 __user *address = u64_to_user_ptr(extensions);
3287 	struct drm_xe_user_extension ext;
3288 	int err;
3289 	u32 idx;
3290 
3291 	if (XE_IOCTL_DBG(xe, ext_number >= MAX_USER_EXTENSIONS))
3292 		return -E2BIG;
3293 
3294 	err = copy_from_user(&ext, address, sizeof(ext));
3295 	if (XE_IOCTL_DBG(xe, err))
3296 		return -EFAULT;
3297 
3298 	if (XE_IOCTL_DBG(xe, ext.pad) ||
3299 	    XE_IOCTL_DBG(xe, ext.name >= ARRAY_SIZE(gem_create_user_extension_funcs)))
3300 		return -EINVAL;
3301 
3302 	idx = array_index_nospec(ext.name,
3303 				 ARRAY_SIZE(gem_create_user_extension_funcs));
3304 	err = gem_create_user_extension_funcs[idx](xe, bo, extensions);
3305 	if (XE_IOCTL_DBG(xe, err))
3306 		return err;
3307 
3308 	if (ext.next_extension)
3309 		return gem_create_user_extensions(xe, bo, ext.next_extension,
3310 						  ++ext_number);
3311 
3312 	return 0;
3313 }
3314 
/**
 * xe_gem_create_ioctl() - Create a user buffer object and a GEM handle for it
 * @dev: The DRM device.
 * @data: The ioctl argument, a struct drm_xe_gem_create.
 * @file: The DRM file the handle is created for.
 *
 * Validates the userspace arguments, translates them into xe bo flags,
 * creates the bo (optionally private to the VM named by @args->vm_id),
 * processes any chained user extensions and finally publishes the bo through
 * a new GEM handle returned in @args->handle.
 *
 * Return: 0 on success, negative error code on failure.
 */
int xe_gem_create_ioctl(struct drm_device *dev, void *data,
			struct drm_file *file)
{
	struct xe_device *xe = to_xe_device(dev);
	struct xe_file *xef = to_xe_file(file);
	struct drm_xe_gem_create *args = data;
	struct xe_validation_ctx ctx;
	struct drm_exec exec;
	struct xe_vm *vm = NULL;
	struct xe_bo *bo;
	unsigned int bo_flags;
	u32 handle;
	int err;

	/* Padding and reserved fields must be zero. */
	if (XE_IOCTL_DBG(xe, args->pad[0] || args->pad[1] || args->pad[2]) ||
	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
		return -EINVAL;

	/* at least one valid memory placement must be specified */
	if (XE_IOCTL_DBG(xe, (args->placement & ~xe->info.mem_region_mask) ||
			 !args->placement))
		return -EINVAL;

	/* Reject any flag bit this ioctl does not know about. */
	if (XE_IOCTL_DBG(xe, args->flags &
			 ~(DRM_XE_GEM_CREATE_FLAG_DEFER_BACKING |
			   DRM_XE_GEM_CREATE_FLAG_SCANOUT |
			   DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM |
			   DRM_XE_GEM_CREATE_FLAG_NO_COMPRESSION)))
		return -EINVAL;

	/* The handle is an output; it must be zero on input. */
	if (XE_IOCTL_DBG(xe, args->handle))
		return -EINVAL;

	/* The size must be non-zero, fit in a size_t and be page aligned. */
	if (XE_IOCTL_DBG(xe, !args->size))
		return -EINVAL;

	if (XE_IOCTL_DBG(xe, args->size > SIZE_MAX))
		return -EINVAL;

	if (XE_IOCTL_DBG(xe, args->size & ~PAGE_MASK))
		return -EINVAL;

	/* Translate the uAPI flags into xe bo flags. */
	bo_flags = 0;
	if (args->flags & DRM_XE_GEM_CREATE_FLAG_DEFER_BACKING)
		bo_flags |= XE_BO_FLAG_DEFER_BACKING;

	/*
	 * Display scanout is always non-coherent with the CPU cache.
	 */
	if (args->flags & DRM_XE_GEM_CREATE_FLAG_SCANOUT)
		bo_flags |= XE_BO_FLAG_FORCE_WC;

	/* Opting out of compression is refused below graphics version 20. */
	if (args->flags & DRM_XE_GEM_CREATE_FLAG_NO_COMPRESSION) {
		if (XE_IOCTL_DBG(xe, GRAPHICS_VER(xe) < 20))
			return -EOPNOTSUPP;
		bo_flags |= XE_BO_FLAG_NO_COMPRESSION;
	}

	/*
	 * Shift the placement mask so that its lowest bit lines up with
	 * XE_BO_FLAG_SYSTEM.
	 */
	bo_flags |= args->placement << (ffs(XE_BO_FLAG_SYSTEM) - 1);

	/* CCS formats need physical placement at a 64K alignment in VRAM. */
	if ((bo_flags & XE_BO_FLAG_VRAM_MASK) &&
	    (args->flags & DRM_XE_GEM_CREATE_FLAG_SCANOUT) &&
	    !(xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K) &&
	    IS_ALIGNED(args->size, SZ_64K))
		bo_flags |= XE_BO_FLAG_NEEDS_64K;

	/* Visible VRAM only makes sense if a VRAM placement was requested. */
	if (args->flags & DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM) {
		if (XE_IOCTL_DBG(xe, !(bo_flags & XE_BO_FLAG_VRAM_MASK)))
			return -EINVAL;

		bo_flags |= XE_BO_FLAG_NEEDS_CPU_ACCESS;
	}

	/* A CPU caching mode is mandatory and must be a known one. */
	if (XE_IOCTL_DBG(xe, !args->cpu_caching ||
			 args->cpu_caching > DRM_XE_GEM_CPU_CACHING_WC))
		return -EINVAL;

	/* Any VRAM placement requires write-combined CPU caching. */
	if (XE_IOCTL_DBG(xe, bo_flags & XE_BO_FLAG_VRAM_MASK &&
			 args->cpu_caching != DRM_XE_GEM_CPU_CACHING_WC))
		return -EINVAL;

	/* A bo forced to WC (scanout) can't ask for write-back caching. */
	if (XE_IOCTL_DBG(xe, bo_flags & XE_BO_FLAG_FORCE_WC &&
			 args->cpu_caching == DRM_XE_GEM_CPU_CACHING_WB))
		return -EINVAL;

	/* Optional VM to create the bo in; the reference is dropped at out_vm. */
	if (args->vm_id) {
		vm = xe_vm_lookup(xef, args->vm_id);
		if (XE_IOCTL_DBG(xe, !vm))
			return -ENOENT;
	}

	/*
	 * Create the bo inside a validation transaction, with the VM locked
	 * through the drm_exec if one was given.
	 * NOTE(review): the guarded block presumably restarts on lock
	 * contention and on the OOM retry - confirm against
	 * xe_validation_guard().
	 */
	err = 0;
	xe_validation_guard(&ctx, &xe->val, &exec, (struct xe_val_flags) {.interruptible = true},
			    err) {
		if (vm) {
			err = xe_vm_drm_exec_lock(vm, &exec);
			drm_exec_retry_on_contention(&exec);
			if (err)
				break;
		}
		bo = xe_bo_create_user(xe, vm, args->size, args->cpu_caching,
				       bo_flags, &exec);
		drm_exec_retry_on_contention(&exec);
		if (IS_ERR(bo)) {
			err = PTR_ERR(bo);
			xe_validation_retry_on_oom(&ctx, &err);
			break;
		}
	}
	if (err)
		goto out_vm;

	if (args->extensions) {
		err = gem_create_user_extensions(xe, bo, args->extensions, 0);
		if (err)
			goto out_bulk;
	}

	err = drm_gem_handle_create(file, &bo->ttm.base, &handle);
	if (err)
		goto out_bulk;

	/* Success: skip the bulk-move cleanup and just drop our reference. */
	args->handle = handle;
	goto out_put;

out_bulk:
	/*
	 * Failure after the bo was created: for a bo created in a VM that is
	 * not in fault mode, undo the bulk move setup under the VM lock.
	 */
	if (vm && !xe_vm_in_fault_mode(vm)) {
		xe_vm_lock(vm, false);
		__xe_bo_unset_bulk_move(bo);
		xe_vm_unlock(vm);
	}
out_put:
	/* Drop the creation reference on both the success and error paths. */
	xe_bo_put(bo);
out_vm:
	if (vm)
		xe_vm_put(vm);

	return err;
}
3455 
/**
 * xe_gem_mmap_offset_ioctl() - Return the fake mmap offset for a GEM object
 * @dev: The DRM device.
 * @data: The ioctl argument, a struct drm_xe_gem_mmap_offset.
 * @file: The DRM file the handle in @data belongs to.
 *
 * With DRM_XE_MMAP_OFFSET_FLAG_PCI_BARRIER set, no handle is looked up and the
 * fixed XE_PCI_BARRIER_MMAP_OFFSET is returned instead. Otherwise the handle
 * is resolved and the offset of the object's vma node is returned.
 *
 * Return: 0 on success with @args->offset filled in, -EINVAL on invalid
 * arguments, -ENOENT if the handle does not name a GEM object.
 */
int xe_gem_mmap_offset_ioctl(struct drm_device *dev, void *data,
			     struct drm_file *file)
{
	struct xe_device *xe = to_xe_device(dev);
	struct drm_xe_gem_mmap_offset *args = data;
	struct drm_gem_object *gem_obj;

	/* No extensions are supported and reserved fields must be zero. */
	if (XE_IOCTL_DBG(xe, args->extensions) ||
	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
		return -EINVAL;

	/* The PCI barrier flag is the only flag accepted. */
	if (XE_IOCTL_DBG(xe, args->flags &
			 ~DRM_XE_MMAP_OFFSET_FLAG_PCI_BARRIER))
		return -EINVAL;

	if (args->flags & DRM_XE_MMAP_OFFSET_FLAG_PCI_BARRIER) {
		/* Only offered on discrete devices. */
		if (XE_IOCTL_DBG(xe, !IS_DGFX(xe)))
			return -EINVAL;

		/* The barrier mapping is not backed by a GEM object. */
		if (XE_IOCTL_DBG(xe, args->handle))
			return -EINVAL;

		/* Refused when the CPU page size is larger than 4K. */
		if (XE_IOCTL_DBG(xe, PAGE_SIZE > SZ_4K))
			return -EINVAL;

		/*
		 * Compile-time check that the barrier offset stays below
		 * DRM_FILE_PAGE_OFFSET_START.
		 * NOTE(review): presumably so that it can never collide with
		 * a regular GEM mmap offset - confirm.
		 */
		BUILD_BUG_ON(((XE_PCI_BARRIER_MMAP_OFFSET >> XE_PTE_SHIFT) +
			      SZ_4K) >= DRM_FILE_PAGE_OFFSET_START);
		args->offset = XE_PCI_BARRIER_MMAP_OFFSET;
		return 0;
	}

	gem_obj = drm_gem_object_lookup(file, args->handle);
	if (XE_IOCTL_DBG(xe, !gem_obj))
		return -ENOENT;

	/* The mmap offset was set up at BO allocation time. */
	args->offset = drm_vma_node_offset_addr(&gem_obj->vma_node);

	/* Drop the reference taken by the lookup. */
	xe_bo_put(gem_to_xe_bo(gem_obj));
	return 0;
}
3497 
3498 /**
3499  * xe_bo_decompress - schedule in-place decompress and install fence
3500  * @bo: buffer object (caller should hold drm_exec reservations for VM+BO)
3501  *
3502  * Schedules an in-place resolve via the migrate layer and installs the
3503  * returned dma_fence into the BO kernel reservation slot (DMA_RESV_USAGE_KERNEL).
3504  * In preempt fence mode, this operation interrupts hardware execution
3505  * which is expensive. Page fault mode is recommended for better performance.
3506  *
3507  * The resolve path only runs for VRAM-backed buffers (currently dGPU-only);
3508  * iGPU/system-memory objects fail the resource check and bypass the resolve.
3509  *
3510  * Returns 0 on success, negative errno on error.
3511  */
3512 int xe_bo_decompress(struct xe_bo *bo)
3513 {
3514 	struct xe_device *xe = xe_bo_device(bo);
3515 	struct xe_tile *tile = xe_device_get_root_tile(xe);
3516 	struct dma_fence *decomp_fence = NULL;
3517 	struct ttm_operation_ctx op_ctx = {
3518 		.interruptible = true,
3519 		.no_wait_gpu = false,
3520 		.gfp_retry_mayfail = false,
3521 	};
3522 	int err = 0;
3523 
3524 	/* Silently skip decompression for non-VRAM buffers */
3525 	if (!bo->ttm.resource || !mem_type_is_vram(bo->ttm.resource->mem_type))
3526 		return 0;
3527 
3528 	/* Notify before scheduling resolve */
3529 	err = xe_bo_move_notify(bo, &op_ctx);
3530 	if (err)
3531 		return err;
3532 
3533 	/* Reserve fence slot before scheduling */
3534 	err = dma_resv_reserve_fences(bo->ttm.base.resv, 1);
3535 	if (err)
3536 		return err;
3537 
3538 	/* Schedule the in-place decompression */
3539 	decomp_fence = xe_migrate_resolve(tile->migrate,
3540 					  bo,
3541 					  bo->ttm.resource);
3542 
3543 	if (IS_ERR(decomp_fence))
3544 		return PTR_ERR(decomp_fence);
3545 
3546 	/* Install kernel-usage fence */
3547 	dma_resv_add_fence(bo->ttm.base.resv, decomp_fence, DMA_RESV_USAGE_KERNEL);
3548 	dma_fence_put(decomp_fence);
3549 
3550 	return 0;
3551 }
3552 
3553 /**
3554  * xe_bo_lock() - Lock the buffer object's dma_resv object
3555  * @bo: The struct xe_bo whose lock is to be taken
3556  * @intr: Whether to perform any wait interruptible
3557  *
3558  * Locks the buffer object's dma_resv object. If the buffer object is
3559  * pointing to a shared dma_resv object, that shared lock is locked.
3560  *
3561  * Return: 0 on success, -EINTR if @intr is true and the wait for a
3562  * contended lock was interrupted. If @intr is set to false, the
3563  * function always returns 0.
3564  */
3565 int xe_bo_lock(struct xe_bo *bo, bool intr)
3566 {
3567 	if (intr)
3568 		return dma_resv_lock_interruptible(bo->ttm.base.resv, NULL);
3569 
3570 	dma_resv_lock(bo->ttm.base.resv, NULL);
3571 
3572 	return 0;
3573 }
3574 
3575 /**
3576  * xe_bo_unlock() - Unlock the buffer object's dma_resv object
3577  * @bo: The struct xe_bo whose lock is to be released.
3578  *
3579  * Unlock a buffer object lock that was locked by xe_bo_lock().
3580  */
3581 void xe_bo_unlock(struct xe_bo *bo)
3582 {
3583 	dma_resv_unlock(bo->ttm.base.resv);
3584 }
3585 
3586 /**
3587  * xe_bo_can_migrate - Whether a buffer object likely can be migrated
3588  * @bo: The buffer object to migrate
3589  * @mem_type: The TTM memory type intended to migrate to
3590  *
3591  * Check whether the buffer object supports migration to the
3592  * given memory type. Note that pinning may affect the ability to migrate as
3593  * returned by this function.
3594  *
3595  * This function is primarily intended as a helper for checking the
3596  * possibility to migrate buffer objects and can be called without
3597  * the object lock held.
3598  *
3599  * Return: true if migration is possible, false otherwise.
3600  */
3601 bool xe_bo_can_migrate(struct xe_bo *bo, u32 mem_type)
3602 {
3603 	unsigned int cur_place;
3604 
3605 	if (bo->ttm.type == ttm_bo_type_kernel)
3606 		return true;
3607 
3608 	if (bo->ttm.type == ttm_bo_type_sg)
3609 		return false;
3610 
3611 	for (cur_place = 0; cur_place < bo->placement.num_placement;
3612 	     cur_place++) {
3613 		if (bo->placements[cur_place].mem_type == mem_type)
3614 			return true;
3615 	}
3616 
3617 	return false;
3618 }
3619 
3620 static void xe_place_from_ttm_type(u32 mem_type, struct ttm_place *place)
3621 {
3622 	memset(place, 0, sizeof(*place));
3623 	place->mem_type = mem_type;
3624 }
3625 
3626 /**
3627  * xe_bo_migrate - Migrate an object to the desired region id
3628  * @bo: The buffer object to migrate.
3629  * @mem_type: The TTM region type to migrate to.
3630  * @tctx: A pointer to a struct ttm_operation_ctx or NULL if
3631  * a default interruptibe ctx is to be used.
3632  * @exec: The drm_exec transaction to use for exhaustive eviction.
3633  *
3634  * Attempt to migrate the buffer object to the desired memory region. The
3635  * buffer object may not be pinned, and must be locked.
3636  * On successful completion, the object memory type will be updated,
3637  * but an async migration task may not have completed yet, and to
3638  * accomplish that, the object's kernel fences must be signaled with
3639  * the object lock held.
3640  *
3641  * Return: 0 on success. Negative error code on failure. In particular may
3642  * return -EINTR or -ERESTARTSYS if signal pending.
3643  */
3644 int xe_bo_migrate(struct xe_bo *bo, u32 mem_type, struct ttm_operation_ctx *tctx,
3645 		  struct drm_exec *exec)
3646 {
3647 	struct xe_device *xe = ttm_to_xe_device(bo->ttm.bdev);
3648 	struct ttm_operation_ctx ctx = {
3649 		.interruptible = true,
3650 		.no_wait_gpu = false,
3651 		.gfp_retry_mayfail = true,
3652 	};
3653 	struct ttm_placement placement;
3654 	struct ttm_place requested;
3655 
3656 	xe_bo_assert_held(bo);
3657 	tctx = tctx ? tctx : &ctx;
3658 
3659 	if (bo->ttm.resource->mem_type == mem_type)
3660 		return 0;
3661 
3662 	if (xe_bo_is_pinned(bo))
3663 		return -EBUSY;
3664 
3665 	if (!xe_bo_can_migrate(bo, mem_type))
3666 		return -EINVAL;
3667 
3668 	xe_place_from_ttm_type(mem_type, &requested);
3669 	placement.num_placement = 1;
3670 	placement.placement = &requested;
3671 
3672 	/*
3673 	 * Stolen needs to be handled like below VRAM handling if we ever need
3674 	 * to support it.
3675 	 */
3676 	drm_WARN_ON(&xe->drm, mem_type == XE_PL_STOLEN);
3677 
3678 	if (mem_type_is_vram(mem_type)) {
3679 		u32 c = 0;
3680 
3681 		add_vram(xe, bo, &requested, bo->flags, mem_type, &c);
3682 	}
3683 
3684 	if (!tctx->no_wait_gpu)
3685 		xe_validation_assert_exec(xe_bo_device(bo), exec, &bo->ttm.base);
3686 	return ttm_bo_validate(&bo->ttm, &placement, tctx);
3687 }
3688 
3689 /**
3690  * xe_bo_evict - Evict an object to evict placement
3691  * @bo: The buffer object to migrate.
3692  * @exec: The drm_exec transaction to use for exhaustive eviction.
3693  *
3694  * On successful completion, the object memory will be moved to evict
3695  * placement. This function blocks until the object has been fully moved.
3696  *
3697  * Return: 0 on success. Negative error code on failure.
3698  */
3699 int xe_bo_evict(struct xe_bo *bo, struct drm_exec *exec)
3700 {
3701 	struct ttm_operation_ctx ctx = {
3702 		.interruptible = false,
3703 		.no_wait_gpu = false,
3704 		.gfp_retry_mayfail = true,
3705 	};
3706 	struct ttm_placement placement;
3707 	int ret;
3708 
3709 	xe_evict_flags(&bo->ttm, &placement);
3710 	ret = ttm_bo_validate(&bo->ttm, &placement, &ctx);
3711 	if (ret)
3712 		return ret;
3713 
3714 	dma_resv_wait_timeout(bo->ttm.base.resv, DMA_RESV_USAGE_KERNEL,
3715 			      false, MAX_SCHEDULE_TIMEOUT);
3716 
3717 	return 0;
3718 }
3719 
3720 /**
3721  * xe_bo_needs_ccs_pages - Whether a bo needs to back up CCS pages when
3722  * placed in system memory.
3723  * @bo: The xe_bo
3724  *
3725  * Return: true if extra pages need to be allocated, false otherwise.
3726  */
3727 bool xe_bo_needs_ccs_pages(struct xe_bo *bo)
3728 {
3729 	struct xe_device *xe = xe_bo_device(bo);
3730 
3731 	if (GRAPHICS_VER(xe) >= 20 && IS_DGFX(xe))
3732 		return false;
3733 
3734 	if (!xe_device_has_flat_ccs(xe) || bo->ttm.type != ttm_bo_type_device)
3735 		return false;
3736 
3737 	/* On discrete GPUs, if the GPU can access this buffer from
3738 	 * system memory (i.e., it allows XE_PL_TT placement), FlatCCS
3739 	 * can't be used since there's no CCS storage associated with
3740 	 * non-VRAM addresses.
3741 	 */
3742 	if (IS_DGFX(xe) && (bo->flags & XE_BO_FLAG_SYSTEM))
3743 		return false;
3744 
3745 	/* Check if userspace explicitly requested no compression */
3746 	if (bo->flags & XE_BO_FLAG_NO_COMPRESSION)
3747 		return false;
3748 
3749 	/*
3750 	 * For WB (Write-Back) CPU caching mode, check if the device
3751 	 * supports WB compression with coherency.
3752 	 */
3753 	if (bo->cpu_caching == DRM_XE_GEM_CPU_CACHING_WB &&
3754 	    xe->pat.idx[XE_CACHE_WB_COMPRESSION] == XE_PAT_INVALID_IDX)
3755 		return false;
3756 
3757 	return true;
3758 }
3759 
/**
 * __xe_bo_release_dummy() - Dummy kref release function
 * @kref: The embedded struct kref. Unused.
 *
 * Intentionally empty release function for xe_bo_put_deferred(); bos whose
 * put was deferred are freed later by xe_bo_put_commit(). Not intended for
 * any other use.
 */
void __xe_bo_release_dummy(struct kref *kref)
{
}
3769 
3770 /**
3771  * xe_bo_put_commit() - Put bos whose put was deferred by xe_bo_put_deferred().
3772  * @deferred: The lockless list used for the call to xe_bo_put_deferred().
3773  *
3774  * Puts all bos whose put was deferred by xe_bo_put_deferred().
3775  * The @deferred list can be either an onstack local list or a global
3776  * shared list used by a workqueue.
3777  */
3778 void xe_bo_put_commit(struct llist_head *deferred)
3779 {
3780 	struct llist_node *freed;
3781 	struct xe_bo *bo, *next;
3782 
3783 	if (!deferred)
3784 		return;
3785 
3786 	freed = llist_del_all(deferred);
3787 	if (!freed)
3788 		return;
3789 
3790 	llist_for_each_entry_safe(bo, next, freed, freed)
3791 		drm_gem_object_free(&bo->ttm.base.refcount);
3792 }
3793 
3794 static void xe_bo_dev_work_func(struct work_struct *work)
3795 {
3796 	struct xe_bo_dev *bo_dev = container_of(work, typeof(*bo_dev), async_free);
3797 
3798 	xe_bo_put_commit(&bo_dev->async_list);
3799 }
3800 
/**
 * xe_bo_dev_init() - Initialize BO dev to manage async BO freeing
 * @bo_dev: The BO dev structure
 *
 * Sets up the @bo_dev->async_free work item so that it runs
 * xe_bo_dev_work_func() when scheduled.
 */
void xe_bo_dev_init(struct xe_bo_dev *bo_dev)
{
	INIT_WORK(&bo_dev->async_free, xe_bo_dev_work_func);
}
3809 
/**
 * xe_bo_dev_fini() - Finalize BO dev managing async BO freeing
 * @bo_dev: The BO dev structure
 *
 * Flushes the @bo_dev->async_free work item, i.e. waits for a pending or
 * running instance of it to finish.
 */
void xe_bo_dev_fini(struct xe_bo_dev *bo_dev)
{
	flush_work(&bo_dev->async_free);
}
3818 
3819 void xe_bo_put(struct xe_bo *bo)
3820 {
3821 	struct xe_tile *tile;
3822 	u8 id;
3823 
3824 	might_sleep();
3825 	if (bo) {
3826 #ifdef CONFIG_PROC_FS
3827 		if (bo->client)
3828 			might_lock(&bo->client->bos_lock);
3829 #endif
3830 		for_each_tile(tile, xe_bo_device(bo), id)
3831 			if (bo->ggtt_node[id])
3832 				xe_ggtt_might_lock(tile->mem.ggtt);
3833 		drm_gem_object_put(&bo->ttm.base);
3834 	}
3835 }
3836 
3837 /**
3838  * xe_bo_dumb_create - Create a dumb bo as backing for a fb
3839  * @file_priv: ...
3840  * @dev: ...
3841  * @args: ...
3842  *
3843  * See dumb_create() hook in include/drm/drm_drv.h
3844  *
3845  * Return: ...
3846  */
3847 int xe_bo_dumb_create(struct drm_file *file_priv,
3848 		      struct drm_device *dev,
3849 		      struct drm_mode_create_dumb *args)
3850 {
3851 	struct xe_device *xe = to_xe_device(dev);
3852 	struct xe_bo *bo;
3853 	uint32_t handle;
3854 	int err;
3855 	u32 page_size = max_t(u32, PAGE_SIZE,
3856 		xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K ? SZ_64K : SZ_4K);
3857 
3858 	err = drm_mode_size_dumb(dev, args, SZ_64, page_size);
3859 	if (err)
3860 		return err;
3861 
3862 	bo = xe_bo_create_user(xe, NULL, args->size,
3863 			       DRM_XE_GEM_CPU_CACHING_WC,
3864 			       XE_BO_FLAG_VRAM_IF_DGFX(xe_device_get_root_tile(xe)) |
3865 			       XE_BO_FLAG_FORCE_WC |
3866 			       XE_BO_FLAG_NEEDS_CPU_ACCESS, NULL);
3867 	if (IS_ERR(bo))
3868 		return PTR_ERR(bo);
3869 
3870 	err = drm_gem_handle_create(file_priv, &bo->ttm.base, &handle);
3871 	/* drop reference from allocate - handle holds it now */
3872 	drm_gem_object_put(&bo->ttm.base);
3873 	if (!err)
3874 		args->handle = handle;
3875 	return err;
3876 }
3877 
3878 void xe_bo_runtime_pm_release_mmap_offset(struct xe_bo *bo)
3879 {
3880 	struct ttm_buffer_object *tbo = &bo->ttm;
3881 	struct ttm_device *bdev = tbo->bdev;
3882 
3883 	drm_vma_node_unmap(&tbo->base.vma_node, bdev->dev_mapping);
3884 
3885 	list_del_init(&bo->vram_userfault_link);
3886 }
3887 
3888 #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
3889 #include "tests/xe_bo.c"
3890 #endif
3891