xref: /linux/drivers/gpu/drm/xe/xe_migrate.c (revision 82f78acd5a9270370ef4aa3f032ede25f3dc91ee)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2020 Intel Corporation
4  */
5 
6 #include "xe_migrate.h"
7 
8 #include <linux/bitfield.h>
9 #include <linux/sizes.h>
10 
11 #include <drm/drm_managed.h>
12 #include <drm/drm_pagemap.h>
13 #include <drm/ttm/ttm_tt.h>
14 #include <uapi/drm/xe_drm.h>
15 
16 #include <generated/xe_wa_oob.h>
17 
18 #include "instructions/xe_gpu_commands.h"
19 #include "instructions/xe_mi_commands.h"
20 #include "regs/xe_gtt_defs.h"
21 #include "tests/xe_test.h"
22 #include "xe_assert.h"
23 #include "xe_bb.h"
24 #include "xe_bo.h"
25 #include "xe_exec_queue.h"
26 #include "xe_ggtt.h"
27 #include "xe_gt.h"
28 #include "xe_hw_engine.h"
29 #include "xe_lrc.h"
30 #include "xe_map.h"
31 #include "xe_mocs.h"
32 #include "xe_printk.h"
33 #include "xe_pt.h"
34 #include "xe_res_cursor.h"
35 #include "xe_sa.h"
36 #include "xe_sched_job.h"
37 #include "xe_sync.h"
38 #include "xe_trace_bo.h"
39 #include "xe_validation.h"
40 #include "xe_vm.h"
41 #include "xe_vram.h"
42 
43 /**
44  * struct xe_migrate - migrate context.
45  */
46 struct xe_migrate {
47 	/** @q: Default exec queue used for migration */
48 	struct xe_exec_queue *q;
49 	/** @tile: Backpointer to the tile this struct xe_migrate belongs to. */
50 	struct xe_tile *tile;
51 	/** @job_mutex: Timeline mutex for @q. */
52 	struct mutex job_mutex;
53 	/** @pt_bo: Page-table buffer object. */
54 	struct xe_bo *pt_bo;
55 	/** @batch_base_ofs: VM offset of the migration batch buffer */
56 	u64 batch_base_ofs;
57 	/** @usm_batch_base_ofs: VM offset of the usm batch buffer */
58 	u64 usm_batch_base_ofs;
59 	/** @cleared_mem_ofs: VM offset of the NULL mapping used as cleared memory. */
60 	u64 cleared_mem_ofs;
61 	/** @large_page_copy_ofs: VM offset of 2M pages used for large copies */
62 	u64 large_page_copy_ofs;
63 	/**
64 	 * @large_page_copy_pdes: BO offset at which to write out the 2M PDEs
65 	 * used for large copies
66 	 */
67 	u64 large_page_copy_pdes;
68 	/**
69 	 * @fence: dma-fence representing the last migration job batch.
70 	 * Protected by @job_mutex.
71 	 */
72 	struct dma_fence *fence;
73 	/**
74 	 * @vm_update_sa: For integrated, used to suballocate page-tables
75 	 * out of the pt_bo.
76 	 */
77 	struct drm_suballoc_manager vm_update_sa;
78 	/** @min_chunk_size: For dgfx, Minimum chunk size */
79 	u64 min_chunk_size;
80 };
81 
82 #define MAX_PREEMPTDISABLE_TRANSFER SZ_8M /* Around 1ms. */
83 #define MAX_CCS_LIMITED_TRANSFER SZ_4M /* XE_PAGE_SIZE * (FIELD_MAX(XE2_CCS_SIZE_MASK) + 1) */
84 #define NUM_KERNEL_PDE 15
85 #define NUM_PT_SLOTS 32
86 #define LEVEL0_PAGE_TABLE_ENCODE_SIZE SZ_2M
87 #define MAX_NUM_PTE 512
88 #define IDENTITY_OFFSET 256ULL
89 
90 /*
91  * Although MI_STORE_DATA_IMM's "length" field is 10 bits wide, 0x3FE is the largest
92  * legal value accepted.  Since that instruction field is always stored in
93  * (val-2) format, this translates to 0x400 dwords for the true maximum length
94  * of the instruction.  Subtracting the instruction header (1 dword) and
95  * address (2 dwords), that leaves 0x3FD dwords (0x1FE qwords) for PTE values.
96  */
97 #define MAX_PTE_PER_SDI 0x1FEU
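
/*
 * Editor's note, a worked example of the arithmetic above (illustrative only,
 * not part of the driver):
 *
 *	0x3FE + 2     = 0x400 total instruction dwords
 *	0x400 - 1 - 2 = 0x3FD dwords left for PTE data
 *	0x3FD / 2     = 0x1FE whole qword PTEs  =>  MAX_PTE_PER_SDI
 */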
98 
99 static void xe_migrate_fini(void *arg)
100 {
101 	struct xe_migrate *m = arg;
102 
103 	xe_vm_lock(m->q->vm, false);
104 	xe_bo_unpin(m->pt_bo);
105 	xe_vm_unlock(m->q->vm);
106 
107 	dma_fence_put(m->fence);
108 	xe_bo_put(m->pt_bo);
109 	drm_suballoc_manager_fini(&m->vm_update_sa);
110 	mutex_destroy(&m->job_mutex);
111 	xe_vm_close_and_put(m->q->vm);
112 	xe_exec_queue_put(m->q);
113 }
114 
115 static u64 xe_migrate_vm_addr(u64 slot, u32 level)
116 {
117 	XE_WARN_ON(slot >= NUM_PT_SLOTS);
118 
119 	/* First slot is reserved for mapping of PT bo and bb, start from 1 */
120 	return (slot + 1ULL) << xe_pt_shift(level + 1);
121 }
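
/*
 * Editor's sketch of the resulting addresses, assuming 4 KiB pages so that
 * xe_pt_shift(1) == 21 and xe_pt_shift(2) == 30:
 *
 *	xe_migrate_vm_addr(0, 0) == 1ULL << 21 == 2M	(slot 0, level-0 PT)
 *	xe_migrate_vm_addr(4, 0) == 5ULL << 21 == 10M
 *	xe_migrate_vm_addr(0, 1) == 1ULL << 30 == 1G	(slot 0, level-1 PT)
 */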
122 
123 static u64 xe_migrate_vram_ofs(struct xe_device *xe, u64 addr, bool is_comp_pte)
124 {
125 	/*
126 	 * Subtract the DPA base to get the correct offset into the identity
127 	 * table for the migrate offset
128 	 */
129 	u64 identity_offset = IDENTITY_OFFSET;
130 
131 	if (GRAPHICS_VER(xe) >= 20 && is_comp_pte)
132 		identity_offset += DIV_ROUND_UP_ULL(xe_vram_region_actual_physical_size
133 							(xe->mem.vram), SZ_1G);
134 
135 	addr -= xe_vram_region_dpa_base(xe->mem.vram);
136 	return addr + (identity_offset << xe_pt_shift(2));
137 }
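
/*
 * Editor's sketch, assuming xe_pt_shift(2) == 30 and is_comp_pte == false:
 * for a physical address 1 GiB above the DPA base,
 *
 *	addr - dpa_base                 == 1 GiB
 *	1 GiB + (IDENTITY_OFFSET << 30) == 257 GiB
 *
 * i.e. the address lands 1 GiB into the identity map that starts at the
 * 256 GiB mark of the migrate VM.
 */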
138 
139 static void xe_migrate_program_identity(struct xe_device *xe, struct xe_vm *vm, struct xe_bo *bo,
140 					u64 map_ofs, u64 vram_offset, u16 pat_index, u64 pt_2m_ofs)
141 {
142 	struct xe_vram_region *vram = xe->mem.vram;
143 	resource_size_t dpa_base = xe_vram_region_dpa_base(vram);
144 	u64 pos, ofs, flags;
145 	u64 entry;
146 	/* XXX: Unclear if this should be usable_size? */
147 	u64 vram_limit = xe_vram_region_actual_physical_size(vram) + dpa_base;
148 	u32 level = 2;
149 
150 	ofs = map_ofs + XE_PAGE_SIZE * level + vram_offset * 8;
151 	flags = vm->pt_ops->pte_encode_addr(xe, 0, pat_index, level,
152 					    true, 0);
153 
154 	xe_assert(xe, IS_ALIGNED(xe_vram_region_usable_size(vram), SZ_2M));
155 
156 	/*
157 	 * Use 1GB pages when possible; the last chunk always uses 2M
158 	 * pages, as covering reserved memory (stolen, WOCPM) with a single
159 	 * mapping is not allowed on certain platforms.
160 	 */
161 	for (pos = dpa_base; pos < vram_limit;
162 	     pos += SZ_1G, ofs += 8) {
163 		if (pos + SZ_1G >= vram_limit) {
164 			entry = vm->pt_ops->pde_encode_bo(bo, pt_2m_ofs);
165 			xe_map_wr(xe, &bo->vmap, ofs, u64, entry);
166 
167 			flags = vm->pt_ops->pte_encode_addr(xe, 0,
168 							    pat_index,
169 							    level - 1,
170 							    true, 0);
171 
172 			for (ofs = pt_2m_ofs; pos < vram_limit;
173 			     pos += SZ_2M, ofs += 8)
174 				xe_map_wr(xe, &bo->vmap, ofs, u64, pos | flags);
175 			break;	/* Ensure pos == vram_limit assert correct */
176 		}
177 
178 		xe_map_wr(xe, &bo->vmap, ofs, u64, pos | flags);
179 	}
180 
181 	xe_assert(xe, pos == vram_limit);
182 }
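
/*
 * Editor's sketch of the 1G/2M split above (hypothetical numbers): with
 * dpa_base == 0 and an actual physical size of 7.5 GiB, seven 1 GiB (level-2)
 * PTEs cover [0, 7G) directly, while the eighth entry becomes a PDE pointing
 * at the PT at @pt_2m_ofs, which is then filled with 256 2 MiB PTEs covering
 * the remaining [7G, 7.5G).
 */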
183 
184 static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m,
185 				 struct xe_vm *vm, struct drm_exec *exec)
186 {
187 	struct xe_device *xe = tile_to_xe(tile);
188 	u16 pat_index = xe->pat.idx[XE_CACHE_WB];
189 	u8 id = tile->id;
190 	u32 num_entries = NUM_PT_SLOTS, num_level = vm->pt_root[id]->level;
191 #define VRAM_IDENTITY_MAP_COUNT	2
192 	u32 num_setup = num_level + VRAM_IDENTITY_MAP_COUNT;
193 #undef VRAM_IDENTITY_MAP_COUNT
194 	u32 map_ofs, level, i;
195 	struct xe_bo *bo, *batch = tile->mem.kernel_bb_pool->bo;
196 	u64 entry, pt29_ofs;
197 
198 	/* Can't bump NUM_PT_SLOTS too high */
199 	BUILD_BUG_ON(NUM_PT_SLOTS > SZ_2M/XE_PAGE_SIZE);
200 	/* Must be a multiple of 64K to support all platforms */
201 	BUILD_BUG_ON(NUM_PT_SLOTS * XE_PAGE_SIZE % SZ_64K);
202 	/* And one slot reserved for the 4KiB page table updates */
203 	BUILD_BUG_ON(!(NUM_KERNEL_PDE & 1));
204 
205 	/* Need to be sure everything fits in the first PT, or create more */
206 	xe_tile_assert(tile, m->batch_base_ofs + xe_bo_size(batch) < SZ_2M);
207 
208 	bo = xe_bo_create_pin_map(vm->xe, tile, vm,
209 				  num_entries * XE_PAGE_SIZE,
210 				  ttm_bo_type_kernel,
211 				  XE_BO_FLAG_VRAM_IF_DGFX(tile) |
212 				  XE_BO_FLAG_PAGETABLE, exec);
213 	if (IS_ERR(bo))
214 		return PTR_ERR(bo);
215 
216 	/* PT30 & PT31 reserved for 2M identity map */
217 	pt29_ofs = xe_bo_size(bo) - 3 * XE_PAGE_SIZE;
218 	entry = vm->pt_ops->pde_encode_bo(bo, pt29_ofs);
219 	xe_pt_write(xe, &vm->pt_root[id]->bo->vmap, 0, entry);
220 
221 	map_ofs = (num_entries - num_setup) * XE_PAGE_SIZE;
222 
223 	/* Map the entire BO in our level 0 pt */
224 	for (i = 0, level = 0; i < num_entries; level++) {
225 		entry = vm->pt_ops->pte_encode_bo(bo, i * XE_PAGE_SIZE,
226 						  pat_index, 0);
227 
228 		xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64, entry);
229 
230 		if (vm->flags & XE_VM_FLAG_64K)
231 			i += 16;
232 		else
233 			i += 1;
234 	}
235 
236 	if (!IS_DGFX(xe)) {
237 		/* Write out batch too */
238 		m->batch_base_ofs = NUM_PT_SLOTS * XE_PAGE_SIZE;
239 		for (i = 0; i < xe_bo_size(batch);
240 		     i += vm->flags & XE_VM_FLAG_64K ? XE_64K_PAGE_SIZE :
241 		     XE_PAGE_SIZE) {
242 			entry = vm->pt_ops->pte_encode_bo(batch, i,
243 							  pat_index, 0);
244 
245 			xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64,
246 				  entry);
247 			level++;
248 		}
249 		if (xe->info.has_usm) {
250 			xe_tile_assert(tile, xe_bo_size(batch) == SZ_1M);
251 
252 			batch = tile->primary_gt->usm.bb_pool->bo;
253 			m->usm_batch_base_ofs = m->batch_base_ofs + SZ_1M;
254 			xe_tile_assert(tile, xe_bo_size(batch) == SZ_512K);
255 
256 			for (i = 0; i < xe_bo_size(batch);
257 			     i += vm->flags & XE_VM_FLAG_64K ? XE_64K_PAGE_SIZE :
258 			     XE_PAGE_SIZE) {
259 				entry = vm->pt_ops->pte_encode_bo(batch, i,
260 								  pat_index, 0);
261 
262 				xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64,
263 					  entry);
264 				level++;
265 			}
266 		}
267 	} else {
268 		u64 batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE);
269 
270 		m->batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr, false);
271 
272 		if (xe->info.has_usm) {
273 			batch = tile->primary_gt->usm.bb_pool->bo;
274 			batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE);
275 			m->usm_batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr, false);
276 		}
277 	}
278 
279 	for (level = 1; level < num_level; level++) {
280 		u32 flags = 0;
281 
282 		if (vm->flags & XE_VM_FLAG_64K && level == 1)
283 			flags = XE_PDE_64K;
284 
285 		entry = vm->pt_ops->pde_encode_bo(bo, map_ofs + (u64)(level - 1) *
286 						  XE_PAGE_SIZE);
287 		xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level, u64,
288 			  entry | flags);
289 	}
290 
291 	/* Write PDE's that point to our BO. */
292 	for (i = 0; i < map_ofs / XE_PAGE_SIZE; i++) {
293 		entry = vm->pt_ops->pde_encode_bo(bo, (u64)i * XE_PAGE_SIZE);
294 
295 		xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE +
296 			  (i + 1) * 8, u64, entry);
297 	}
298 
299 	/* Reserve 2M PDEs */
300 	level = 1;
301 	m->large_page_copy_ofs = NUM_PT_SLOTS << xe_pt_shift(level);
302 	m->large_page_copy_pdes = map_ofs + XE_PAGE_SIZE * level +
303 		NUM_PT_SLOTS * 8;
304 
305 	/* Set up a 1GiB NULL mapping at 255GiB offset. */
306 	level = 2;
307 	xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level + 255 * 8, u64,
308 		  vm->pt_ops->pte_encode_addr(xe, 0, pat_index, level, IS_DGFX(xe), 0)
309 		  | XE_PTE_NULL);
310 	m->cleared_mem_ofs = (255ULL << xe_pt_shift(level));
311 
312 	/* Identity map the entire vram at 256GiB offset */
313 	if (IS_DGFX(xe)) {
314 		u64 pt30_ofs = xe_bo_size(bo) - 2 * XE_PAGE_SIZE;
315 		resource_size_t actual_phy_size = xe_vram_region_actual_physical_size(xe->mem.vram);
316 
317 		xe_migrate_program_identity(xe, vm, bo, map_ofs, IDENTITY_OFFSET,
318 					    pat_index, pt30_ofs);
319 		xe_assert(xe, actual_phy_size <= (MAX_NUM_PTE - IDENTITY_OFFSET) * SZ_1G);
320 
321 		/*
322 		 * Identity map the entire vram for compressed pat_index for xe2+
323 		 * if flat ccs is enabled.
324 		 */
325 		if (GRAPHICS_VER(xe) >= 20 && xe_device_has_flat_ccs(xe)) {
326 			u16 comp_pat_index = xe->pat.idx[XE_CACHE_NONE_COMPRESSION];
327 			u64 vram_offset = IDENTITY_OFFSET +
328 				DIV_ROUND_UP_ULL(actual_phy_size, SZ_1G);
329 			u64 pt31_ofs = xe_bo_size(bo) - XE_PAGE_SIZE;
330 
331 			xe_assert(xe, actual_phy_size <= (MAX_NUM_PTE - IDENTITY_OFFSET -
332 							  IDENTITY_OFFSET / 2) * SZ_1G);
333 			xe_migrate_program_identity(xe, vm, bo, map_ofs, vram_offset,
334 						    comp_pat_index, pt31_ofs);
335 		}
336 	}
337 
338 	/*
339 	 * Example layout created above, with root level = 3:
340 	 * [PT0...PT7]: kernel PTs for copy/clear; 64 KiB or 4 KiB PTEs
341 	 * [PT8]: Kernel PT for VM_BIND, 4 KiB PTE's
342 	 * [PT9...PT26]: Userspace PT's for VM_BIND, 4 KiB PTE's
343 	 * [PT27 = PDE 0] [PT28 = PDE 1] [PT29 = PDE 2] [PT30 & PT31 = 2M vram identity map]
344 	 *
345 	 * This makes the lowest part of the VM point to the pagetables.
346 	 * Hence the lowest 2M in the VM points to itself. With a few writes
347 	 * and flushes, other parts of the VM can be used for copying and
348 	 * clearing.
349 	 *
350 	 * For performance, the kernel reserves PDE's, so about 20 are left
351 	 * for async VM updates.
352 	 *
353 	 * To make things easier, each scratch PT is put in slot (1 + PT #)
354 	 * everywhere; this allows lockless updates to scratch pages by using
355 	 * the different addresses in the VM.
356 	 */
357 #define NUM_VMUSA_UNIT_PER_PAGE	32
358 #define VM_SA_UPDATE_UNIT_SIZE		(XE_PAGE_SIZE / NUM_VMUSA_UNIT_PER_PAGE)
359 #define NUM_VMUSA_WRITES_PER_UNIT	(VM_SA_UPDATE_UNIT_SIZE / sizeof(u64))
360 	drm_suballoc_manager_init(&m->vm_update_sa,
361 				  (size_t)(map_ofs / XE_PAGE_SIZE - NUM_KERNEL_PDE) *
362 				  NUM_VMUSA_UNIT_PER_PAGE, 0);
363 
364 	m->pt_bo = bo;
365 	return 0;
366 }
367 
368 /*
369  * Including the reserved copy engine is required to avoid deadlocks where
370  * migrate jobs servicing faults get stuck behind the job that faulted.
371  */
372 static u32 xe_migrate_usm_logical_mask(struct xe_gt *gt)
373 {
374 	u32 logical_mask = 0;
375 	struct xe_hw_engine *hwe;
376 	enum xe_hw_engine_id id;
377 
378 	for_each_hw_engine(hwe, gt, id) {
379 		if (hwe->class != XE_ENGINE_CLASS_COPY)
380 			continue;
381 
382 		if (xe_gt_is_usm_hwe(gt, hwe))
383 			logical_mask |= BIT(hwe->logical_instance);
384 	}
385 
386 	return logical_mask;
387 }
388 
389 static bool xe_migrate_needs_ccs_emit(struct xe_device *xe)
390 {
391 	return xe_device_has_flat_ccs(xe) && !(GRAPHICS_VER(xe) >= 20 && IS_DGFX(xe));
392 }
393 
394 /**
395  * xe_migrate_alloc - Allocate a migrate struct for a given &xe_tile
396  * @tile: &xe_tile
397  *
398  * Allocates a &xe_migrate for a given tile.
399  *
400  * Return: &xe_migrate on success, or NULL when out of memory.
401  */
402 struct xe_migrate *xe_migrate_alloc(struct xe_tile *tile)
403 {
404 	struct xe_migrate *m = drmm_kzalloc(&tile_to_xe(tile)->drm, sizeof(*m), GFP_KERNEL);
405 
406 	if (m)
407 		m->tile = tile;
408 	return m;
409 }
410 
411 static int xe_migrate_lock_prepare_vm(struct xe_tile *tile, struct xe_migrate *m, struct xe_vm *vm)
412 {
413 	struct xe_device *xe = tile_to_xe(tile);
414 	struct xe_validation_ctx ctx;
415 	struct drm_exec exec;
416 	int err = 0;
417 
418 	xe_validation_guard(&ctx, &xe->val, &exec, (struct xe_val_flags) {}, err) {
419 		err = xe_vm_drm_exec_lock(vm, &exec);
420 		drm_exec_retry_on_contention(&exec);
421 		err = xe_migrate_prepare_vm(tile, m, vm, &exec);
422 		drm_exec_retry_on_contention(&exec);
423 		xe_validation_retry_on_oom(&ctx, &err);
424 	}
425 
426 	return err;
427 }
428 
429 /**
430  * xe_migrate_init() - Initialize a migrate context
431  * @m: The migration context
432  *
433  * Return: 0 if successful, negative error code on failure
434  */
435 int xe_migrate_init(struct xe_migrate *m)
436 {
437 	struct xe_tile *tile = m->tile;
438 	struct xe_gt *primary_gt = tile->primary_gt;
439 	struct xe_device *xe = tile_to_xe(tile);
440 	struct xe_vm *vm;
441 	int err;
442 
443 	/* Special layout, prepared below.. */
444 	vm = xe_vm_create(xe, XE_VM_FLAG_MIGRATION |
445 			  XE_VM_FLAG_SET_TILE_ID(tile), NULL);
446 	if (IS_ERR(vm))
447 		return PTR_ERR(vm);
448 
449 	err = xe_migrate_lock_prepare_vm(tile, m, vm);
450 	if (err)
451 		goto err_out;
452 
453 	if (xe->info.has_usm) {
454 		struct xe_hw_engine *hwe = xe_gt_hw_engine(primary_gt,
455 							   XE_ENGINE_CLASS_COPY,
456 							   primary_gt->usm.reserved_bcs_instance,
457 							   false);
458 		u32 logical_mask = xe_migrate_usm_logical_mask(primary_gt);
459 
460 		if (!hwe || !logical_mask) {
461 			err = -EINVAL;
462 			goto err_out;
463 		}
464 
465 		/*
466 		 * XXX: Currently only reserving 1 (likely slow) BCS instance on
467 		 * PVC, may want to revisit if performance is needed.
468 		 */
469 		m->q = xe_exec_queue_create(xe, vm, logical_mask, 1, hwe,
470 					    EXEC_QUEUE_FLAG_KERNEL |
471 					    EXEC_QUEUE_FLAG_PERMANENT |
472 					    EXEC_QUEUE_FLAG_HIGH_PRIORITY |
473 					    EXEC_QUEUE_FLAG_MIGRATE, 0);
474 	} else {
475 		m->q = xe_exec_queue_create_class(xe, primary_gt, vm,
476 						  XE_ENGINE_CLASS_COPY,
477 						  EXEC_QUEUE_FLAG_KERNEL |
478 						  EXEC_QUEUE_FLAG_PERMANENT |
479 						  EXEC_QUEUE_FLAG_MIGRATE, 0);
480 	}
481 	if (IS_ERR(m->q)) {
482 		err = PTR_ERR(m->q);
483 		goto err_out;
484 	}
485 
486 	mutex_init(&m->job_mutex);
487 	fs_reclaim_acquire(GFP_KERNEL);
488 	might_lock(&m->job_mutex);
489 	fs_reclaim_release(GFP_KERNEL);
490 
491 	err = devm_add_action_or_reset(xe->drm.dev, xe_migrate_fini, m);
492 	if (err)
493 		return err;
494 
495 	if (IS_DGFX(xe)) {
496 		if (xe_migrate_needs_ccs_emit(xe))
497 			/* min chunk size corresponds to 4K of CCS Metadata */
498 			m->min_chunk_size = SZ_4K * SZ_64K /
499 				xe_device_ccs_bytes(xe, SZ_64K);
500 		else
501 			/* Somewhat arbitrary to avoid a huge amount of blits */
502 			m->min_chunk_size = SZ_64K;
503 		m->min_chunk_size = roundup_pow_of_two(m->min_chunk_size);
504 		drm_dbg(&xe->drm, "Migrate min chunk size is 0x%08llx\n",
505 			(unsigned long long)m->min_chunk_size);
506 	}
507 
508 	return err;
509 
510 err_out:
511 	xe_vm_close_and_put(vm);
512 	return err;
513 
514 }
515 
516 static u64 max_mem_transfer_per_pass(struct xe_device *xe)
517 {
518 	if (!IS_DGFX(xe) && xe_device_has_flat_ccs(xe))
519 		return MAX_CCS_LIMITED_TRANSFER;
520 
521 	return MAX_PREEMPTDISABLE_TRANSFER;
522 }
523 
524 static u64 xe_migrate_res_sizes(struct xe_migrate *m, struct xe_res_cursor *cur)
525 {
526 	struct xe_device *xe = tile_to_xe(m->tile);
527 	u64 size = min_t(u64, max_mem_transfer_per_pass(xe), cur->remaining);
528 
529 	if (mem_type_is_vram(cur->mem_type)) {
530 		/*
531 		 * We want to blit VRAM in chunks whose sizes are aligned to
532 		 * min_chunk_size so that the offset to the CCS metadata is
533 		 * page-aligned. If it's the last chunk it may be smaller.
534 		 *
535 		 * Another constraint is that we need to limit the blit to
536 		 * the VRAM block size, unless size is smaller than
537 		 * min_chunk_size.
538 		 */
539 		u64 chunk = max_t(u64, cur->size, m->min_chunk_size);
540 
541 		size = min_t(u64, size, chunk);
542 		if (size > m->min_chunk_size)
543 			size = round_down(size, m->min_chunk_size);
544 	}
545 
546 	return size;
547 }
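
/*
 * Editor's worked example (hypothetical values): with min_chunk_size == SZ_64K,
 * 10M still remaining and a (3M + 4K) VRAM block under the cursor:
 *
 *	size  = min(8M, 10M)             == 8M		(per-pass cap)
 *	chunk = max(3M + 4K, 64K)        == 3M + 4K
 *	size  = min(8M, 3M + 4K)         == 3M + 4K
 *	size  = round_down(3M + 4K, 64K) == 3M
 *
 * so this pass copies 3M and the 4K tail of the block is handled later.
 */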
548 
549 static bool xe_migrate_allow_identity(u64 size, const struct xe_res_cursor *cur)
550 {
551 	/* If the chunk is not fragmented, allow identity map. */
552 	return cur->size >= size;
553 }
554 
555 #define PTE_UPDATE_FLAG_IS_VRAM		BIT(0)
556 #define PTE_UPDATE_FLAG_IS_COMP_PTE	BIT(1)
557 
558 static u32 pte_update_size(struct xe_migrate *m,
559 			   u32 flags,
560 			   struct ttm_resource *res,
561 			   struct xe_res_cursor *cur,
562 			   u64 *L0, u64 *L0_ofs, u32 *L0_pt,
563 			   u32 cmd_size, u32 pt_ofs, u32 avail_pts)
564 {
565 	u32 cmds = 0;
566 	bool is_vram = PTE_UPDATE_FLAG_IS_VRAM & flags;
567 	bool is_comp_pte = PTE_UPDATE_FLAG_IS_COMP_PTE & flags;
568 
569 	*L0_pt = pt_ofs;
570 	if (is_vram && xe_migrate_allow_identity(*L0, cur)) {
571 		/* Offset into identity map. */
572 		*L0_ofs = xe_migrate_vram_ofs(tile_to_xe(m->tile),
573 					      cur->start + vram_region_gpu_offset(res),
574 					      is_comp_pte);
575 		cmds += cmd_size;
576 	} else {
577 		/* Clip L0 to available size */
578 		u64 size = min(*L0, (u64)avail_pts * SZ_2M);
579 		u32 num_4k_pages = (size + XE_PAGE_SIZE - 1) >> XE_PTE_SHIFT;
580 
581 		*L0 = size;
582 		*L0_ofs = xe_migrate_vm_addr(pt_ofs, 0);
583 
584 		/* MI_STORE_DATA_IMM */
585 		cmds += 3 * DIV_ROUND_UP(num_4k_pages, MAX_PTE_PER_SDI);
586 
587 		/* PTE qwords (one qword, i.e. two dwords, per 4K page) */
588 		cmds += num_4k_pages * 2;
589 
590 		/* Each chunk has a single blit command */
591 		cmds += cmd_size;
592 	}
593 
594 	return cmds;
595 }
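
/*
 * Editor's worked example for the non-identity path: a 2M chunk needs 512
 * 4K PTEs. With MAX_PTE_PER_SDI == 0x1FE (510), that is two MI_STORE_DATA_IMM
 * packets:
 *
 *	headers: 3 dwords * DIV_ROUND_UP(512, 510) == 6
 *	PTEs:    512 * 2 dwords                    == 1024
 *	total:   1030 + cmd_size dwords
 */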
596 
597 static void emit_pte(struct xe_migrate *m,
598 		     struct xe_bb *bb, u32 at_pt,
599 		     bool is_vram, bool is_comp_pte,
600 		     struct xe_res_cursor *cur,
601 		     u32 size, struct ttm_resource *res)
602 {
603 	struct xe_device *xe = tile_to_xe(m->tile);
604 	struct xe_vm *vm = m->q->vm;
605 	u16 pat_index;
606 	u32 ptes;
607 	u64 ofs = (u64)at_pt * XE_PAGE_SIZE;
608 	u64 cur_ofs;
609 
610 	/* Indirect access needs a compression-enabled, uncached PAT index */
611 	if (GRAPHICS_VERx100(xe) >= 2000)
612 		pat_index = is_comp_pte ? xe->pat.idx[XE_CACHE_NONE_COMPRESSION] :
613 					  xe->pat.idx[XE_CACHE_WB];
614 	else
615 		pat_index = xe->pat.idx[XE_CACHE_WB];
616 
617 	ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE);
618 
619 	while (ptes) {
620 		u32 chunk = min(MAX_PTE_PER_SDI, ptes);
621 
622 		bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
623 		bb->cs[bb->len++] = ofs;
624 		bb->cs[bb->len++] = 0;
625 
626 		cur_ofs = ofs;
627 		ofs += chunk * 8;
628 		ptes -= chunk;
629 
630 		while (chunk--) {
631 			u64 addr, flags = 0;
632 			bool devmem = false;
633 
634 			addr = xe_res_dma(cur) & PAGE_MASK;
635 			if (is_vram) {
636 				if (vm->flags & XE_VM_FLAG_64K) {
637 					u64 va = cur_ofs * XE_PAGE_SIZE / 8;
638 
639 					xe_assert(xe, (va & (SZ_64K - 1)) ==
640 						  (addr & (SZ_64K - 1)));
641 
642 					flags |= XE_PTE_PS64;
643 				}
644 
645 				addr += vram_region_gpu_offset(res);
646 				devmem = true;
647 			}
648 
649 			addr = vm->pt_ops->pte_encode_addr(m->tile->xe,
650 							   addr, pat_index,
651 							   0, devmem, flags);
652 			bb->cs[bb->len++] = lower_32_bits(addr);
653 			bb->cs[bb->len++] = upper_32_bits(addr);
654 
655 			xe_res_next(cur, min_t(u32, size, PAGE_SIZE));
656 			cur_ofs += 8;
657 		}
658 	}
659 }
660 
661 #define EMIT_COPY_CCS_DW 5
662 static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
663 			  u64 dst_ofs, bool dst_is_indirect,
664 			  u64 src_ofs, bool src_is_indirect,
665 			  u32 size)
666 {
667 	struct xe_device *xe = gt_to_xe(gt);
668 	u32 *cs = bb->cs + bb->len;
669 	u32 num_ccs_blks;
670 	u32 num_pages;
671 	u32 ccs_copy_size;
672 	u32 mocs;
673 
674 	if (GRAPHICS_VERx100(xe) >= 2000) {
675 		num_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE);
676 		xe_gt_assert(gt, FIELD_FIT(XE2_CCS_SIZE_MASK, num_pages - 1));
677 
678 		ccs_copy_size = REG_FIELD_PREP(XE2_CCS_SIZE_MASK, num_pages - 1);
679 		mocs = FIELD_PREP(XE2_XY_CTRL_SURF_MOCS_INDEX_MASK, gt->mocs.uc_index);
680 
681 	} else {
682 		num_ccs_blks = DIV_ROUND_UP(xe_device_ccs_bytes(gt_to_xe(gt), size),
683 					    NUM_CCS_BYTES_PER_BLOCK);
684 		xe_gt_assert(gt, FIELD_FIT(CCS_SIZE_MASK, num_ccs_blks - 1));
685 
686 		ccs_copy_size = REG_FIELD_PREP(CCS_SIZE_MASK, num_ccs_blks - 1);
687 		mocs = FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, gt->mocs.uc_index);
688 	}
689 
690 	*cs++ = XY_CTRL_SURF_COPY_BLT |
691 		(src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT |
692 		(dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT |
693 		ccs_copy_size;
694 	*cs++ = lower_32_bits(src_ofs);
695 	*cs++ = upper_32_bits(src_ofs) | mocs;
696 	*cs++ = lower_32_bits(dst_ofs);
697 	*cs++ = upper_32_bits(dst_ofs) | mocs;
698 
699 	bb->len = cs - bb->cs;
700 }
701 
702 #define EMIT_COPY_DW 10
703 static void emit_xy_fast_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
704 			      u64 dst_ofs, unsigned int size,
705 			      unsigned int pitch)
706 {
707 	struct xe_device *xe = gt_to_xe(gt);
708 	u32 mocs = 0;
709 	u32 tile_y = 0;
710 
711 	xe_gt_assert(gt, !(pitch & 3));
712 	xe_gt_assert(gt, size / pitch <= S16_MAX);
713 	xe_gt_assert(gt, pitch / 4 <= S16_MAX);
714 	xe_gt_assert(gt, pitch <= U16_MAX);
715 
716 	if (GRAPHICS_VER(xe) >= 20)
717 		mocs = FIELD_PREP(XE2_XY_FAST_COPY_BLT_MOCS_INDEX_MASK, gt->mocs.uc_index);
718 
719 	if (GRAPHICS_VERx100(xe) >= 1250)
720 		tile_y = XY_FAST_COPY_BLT_D1_SRC_TILE4 | XY_FAST_COPY_BLT_D1_DST_TILE4;
721 
722 	bb->cs[bb->len++] = XY_FAST_COPY_BLT_CMD | (10 - 2);
723 	bb->cs[bb->len++] = XY_FAST_COPY_BLT_DEPTH_32 | pitch | tile_y | mocs;
724 	bb->cs[bb->len++] = 0;
725 	bb->cs[bb->len++] = (size / pitch) << 16 | pitch / 4;
726 	bb->cs[bb->len++] = lower_32_bits(dst_ofs);
727 	bb->cs[bb->len++] = upper_32_bits(dst_ofs);
728 	bb->cs[bb->len++] = 0;
729 	bb->cs[bb->len++] = pitch | mocs;
730 	bb->cs[bb->len++] = lower_32_bits(src_ofs);
731 	bb->cs[bb->len++] = upper_32_bits(src_ofs);
732 }
733 
734 #define PAGE_COPY_MODE_PS SZ_256 /* hw uses 256 bytes as the page-size */
735 static void emit_mem_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
736 			  u64 dst_ofs, unsigned int size, unsigned int pitch)
737 {
738 	u32 mode, copy_type, width;
739 
740 	xe_gt_assert(gt, IS_ALIGNED(size, pitch));
741 	xe_gt_assert(gt, pitch <= U16_MAX);
742 	xe_gt_assert(gt, pitch);
743 	xe_gt_assert(gt, size);
744 
745 	if (IS_ALIGNED(size, PAGE_COPY_MODE_PS) &&
746 	    IS_ALIGNED(lower_32_bits(src_ofs), PAGE_COPY_MODE_PS) &&
747 	    IS_ALIGNED(lower_32_bits(dst_ofs), PAGE_COPY_MODE_PS)) {
748 		mode = MEM_COPY_PAGE_COPY_MODE;
749 		copy_type = 0; /* linear copy */
750 		width = size / PAGE_COPY_MODE_PS;
751 	} else if (pitch > 1) {
752 		xe_gt_assert(gt, size / pitch <= U16_MAX);
753 		mode = 0; /* BYTE_COPY */
754 		copy_type = MEM_COPY_MATRIX_COPY;
755 		width = pitch;
756 	} else {
757 		mode = 0; /* BYTE_COPY */
758 		copy_type = 0; /* linear copy */
759 		width = size;
760 	}
761 
762 	xe_gt_assert(gt, width <= U16_MAX);
763 
764 	bb->cs[bb->len++] = MEM_COPY_CMD | mode | copy_type;
765 	bb->cs[bb->len++] = width - 1;
766 	bb->cs[bb->len++] = size / pitch - 1; /* ignored by hw for page-copy/linear above */
767 	bb->cs[bb->len++] = pitch - 1;
768 	bb->cs[bb->len++] = pitch - 1;
769 	bb->cs[bb->len++] = lower_32_bits(src_ofs);
770 	bb->cs[bb->len++] = upper_32_bits(src_ofs);
771 	bb->cs[bb->len++] = lower_32_bits(dst_ofs);
772 	bb->cs[bb->len++] = upper_32_bits(dst_ofs);
773 	bb->cs[bb->len++] = FIELD_PREP(MEM_COPY_SRC_MOCS_INDEX_MASK, gt->mocs.uc_index) |
774 			    FIELD_PREP(MEM_COPY_DST_MOCS_INDEX_MASK, gt->mocs.uc_index);
775 }
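
/*
 * Editor's sketch of the mode selection above (hypothetical sizes):
 *
 *	size == 64K, src/dst 256B-aligned   -> page copy,   width == 64K / 256 == 256
 *	size == 8K,  pitch == 4K, unaligned -> matrix copy, width == 4K, 2 rows
 *	size == 100, pitch == 1             -> linear byte copy, width == 100
 */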
776 
777 static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
778 		      u64 src_ofs, u64 dst_ofs, unsigned int size,
779 		      unsigned int pitch)
780 {
781 	struct xe_device *xe = gt_to_xe(gt);
782 
783 	if (xe->info.has_mem_copy_instr)
784 		emit_mem_copy(gt, bb, src_ofs, dst_ofs, size, pitch);
785 	else
786 		emit_xy_fast_copy(gt, bb, src_ofs, dst_ofs, size, pitch);
787 }
788 
789 static u64 xe_migrate_batch_base(struct xe_migrate *m, bool usm)
790 {
791 	return usm ? m->usm_batch_base_ofs : m->batch_base_ofs;
792 }
793 
794 static u32 xe_migrate_ccs_copy(struct xe_migrate *m,
795 			       struct xe_bb *bb,
796 			       u64 src_ofs, bool src_is_indirect,
797 			       u64 dst_ofs, bool dst_is_indirect, u32 dst_size,
798 			       u64 ccs_ofs, bool copy_ccs)
799 {
800 	struct xe_gt *gt = m->tile->primary_gt;
801 	u32 flush_flags = 0;
802 
803 	if (!copy_ccs && dst_is_indirect) {
804 		/*
805 		 * If the src is already in vram, then it should already
806 		 * have been cleared by us, or has been populated by the
807 		 * user. Make sure we copy the CCS aux state as-is.
808 		 *
809 		 * Otherwise if the bo doesn't have any CCS metadata attached,
810 		 * we still need to clear it for security reasons.
811 		 */
812 		u64 ccs_src_ofs =  src_is_indirect ? src_ofs : m->cleared_mem_ofs;
813 
814 		emit_copy_ccs(gt, bb,
815 			      dst_ofs, true,
816 			      ccs_src_ofs, src_is_indirect, dst_size);
817 
818 		flush_flags = MI_FLUSH_DW_CCS;
819 	} else if (copy_ccs) {
820 		if (!src_is_indirect)
821 			src_ofs = ccs_ofs;
822 		else if (!dst_is_indirect)
823 			dst_ofs = ccs_ofs;
824 
825 		xe_gt_assert(gt, src_is_indirect || dst_is_indirect);
826 
827 		emit_copy_ccs(gt, bb, dst_ofs, dst_is_indirect, src_ofs,
828 			      src_is_indirect, dst_size);
829 		if (dst_is_indirect)
830 			flush_flags = MI_FLUSH_DW_CCS;
831 	}
832 
833 	return flush_flags;
834 }
835 
836 /**
837  * xe_migrate_copy() - Copy content of TTM resources.
838  * @m: The migration context.
839  * @src_bo: The buffer object @src is currently bound to.
840  * @dst_bo: If copying between resources created for the same bo, set this to
841  * the same value as @src_bo. If copying between buffer objects, set it to
842  * the buffer object @dst is currently bound to.
843  * @src: The source TTM resource.
844  * @dst: The destination TTM resource.
845  * @copy_only_ccs: If true, copy only the CCS metadata
846  *
847  * Copies the contents of @src to @dst: On flat CCS devices,
848  * the CCS metadata is copied as well if needed, or if not present,
849  * the CCS metadata of @dst is cleared for security reasons.
850  *
851  * Return: Pointer to a dma_fence representing the last copy batch, or
852  * an error pointer on failure. If there is a failure, any copy operation
853  * started by the function call has been synced.
854  */
855 struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
856 				  struct xe_bo *src_bo,
857 				  struct xe_bo *dst_bo,
858 				  struct ttm_resource *src,
859 				  struct ttm_resource *dst,
860 				  bool copy_only_ccs)
861 {
862 	struct xe_gt *gt = m->tile->primary_gt;
863 	struct xe_device *xe = gt_to_xe(gt);
864 	struct dma_fence *fence = NULL;
865 	u64 size = xe_bo_size(src_bo);
866 	struct xe_res_cursor src_it, dst_it, ccs_it;
867 	u64 src_L0_ofs, dst_L0_ofs;
868 	u32 src_L0_pt, dst_L0_pt;
869 	u64 src_L0, dst_L0;
870 	int pass = 0;
871 	int err;
872 	bool src_is_pltt = src->mem_type == XE_PL_TT;
873 	bool dst_is_pltt = dst->mem_type == XE_PL_TT;
874 	bool src_is_vram = mem_type_is_vram(src->mem_type);
875 	bool dst_is_vram = mem_type_is_vram(dst->mem_type);
876 	bool type_device = src_bo->ttm.type == ttm_bo_type_device;
877 	bool needs_ccs_emit = type_device && xe_migrate_needs_ccs_emit(xe);
878 	bool copy_ccs = xe_device_has_flat_ccs(xe) &&
879 		xe_bo_needs_ccs_pages(src_bo) && xe_bo_needs_ccs_pages(dst_bo);
880 	bool copy_system_ccs = copy_ccs && (!src_is_vram || !dst_is_vram);
881 	bool use_comp_pat = type_device && xe_device_has_flat_ccs(xe) &&
882 		GRAPHICS_VER(xe) >= 20 && src_is_vram && !dst_is_vram;
883 
884 	/* Copying CCS between two different BOs is not supported yet. */
885 	if (XE_WARN_ON(copy_ccs && src_bo != dst_bo))
886 		return ERR_PTR(-EINVAL);
887 
888 	if (src_bo != dst_bo && XE_WARN_ON(xe_bo_size(src_bo) != xe_bo_size(dst_bo)))
889 		return ERR_PTR(-EINVAL);
890 
891 	if (!src_is_vram)
892 		xe_res_first_sg(xe_bo_sg(src_bo), 0, size, &src_it);
893 	else
894 		xe_res_first(src, 0, size, &src_it);
895 	if (!dst_is_vram)
896 		xe_res_first_sg(xe_bo_sg(dst_bo), 0, size, &dst_it);
897 	else
898 		xe_res_first(dst, 0, size, &dst_it);
899 
900 	if (copy_system_ccs)
901 		xe_res_first_sg(xe_bo_sg(src_bo), xe_bo_ccs_pages_start(src_bo),
902 				PAGE_ALIGN(xe_device_ccs_bytes(xe, size)),
903 				&ccs_it);
904 
905 	while (size) {
906 		u32 batch_size = 1; /* MI_BATCH_BUFFER_END */
907 		struct xe_sched_job *job;
908 		struct xe_bb *bb;
909 		u32 flush_flags = 0;
910 		u32 update_idx;
911 		u64 ccs_ofs, ccs_size;
912 		u32 ccs_pt;
913 		u32 pte_flags;
914 
915 		bool usm = xe->info.has_usm;
916 		u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;
917 
918 		src_L0 = xe_migrate_res_sizes(m, &src_it);
919 		dst_L0 = xe_migrate_res_sizes(m, &dst_it);
920 
921 		drm_dbg(&xe->drm, "Pass %u, sizes: %llu & %llu\n",
922 			pass++, src_L0, dst_L0);
923 
924 		src_L0 = min(src_L0, dst_L0);
925 
926 		pte_flags = src_is_vram ? PTE_UPDATE_FLAG_IS_VRAM : 0;
927 		pte_flags |= use_comp_pat ? PTE_UPDATE_FLAG_IS_COMP_PTE : 0;
928 		batch_size += pte_update_size(m, pte_flags, src, &src_it, &src_L0,
929 					      &src_L0_ofs, &src_L0_pt, 0, 0,
930 					      avail_pts);
931 		if (copy_only_ccs) {
932 			dst_L0_ofs = src_L0_ofs;
933 		} else {
934 			pte_flags = dst_is_vram ? PTE_UPDATE_FLAG_IS_VRAM : 0;
935 			batch_size += pte_update_size(m, pte_flags, dst,
936 						      &dst_it, &src_L0,
937 						      &dst_L0_ofs, &dst_L0_pt,
938 						      0, avail_pts, avail_pts);
939 		}
940 
941 		if (copy_system_ccs) {
942 			xe_assert(xe, type_device);
943 			ccs_size = xe_device_ccs_bytes(xe, src_L0);
944 			batch_size += pte_update_size(m, 0, NULL, &ccs_it, &ccs_size,
945 						      &ccs_ofs, &ccs_pt, 0,
946 						      2 * avail_pts,
947 						      avail_pts);
948 			xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE));
949 		}
950 
951 		/* Add copy commands size here */
952 		batch_size += ((copy_only_ccs) ? 0 : EMIT_COPY_DW) +
953 			((needs_ccs_emit ? EMIT_COPY_CCS_DW : 0));
954 
955 		bb = xe_bb_new(gt, batch_size, usm);
956 		if (IS_ERR(bb)) {
957 			err = PTR_ERR(bb);
958 			goto err_sync;
959 		}
960 
961 		if (src_is_vram && xe_migrate_allow_identity(src_L0, &src_it))
962 			xe_res_next(&src_it, src_L0);
963 		else
964 			emit_pte(m, bb, src_L0_pt, src_is_vram, copy_system_ccs || use_comp_pat,
965 				 &src_it, src_L0, src);
966 
967 		if (dst_is_vram && xe_migrate_allow_identity(src_L0, &dst_it))
968 			xe_res_next(&dst_it, src_L0);
969 		else if (!copy_only_ccs)
970 			emit_pte(m, bb, dst_L0_pt, dst_is_vram, copy_system_ccs,
971 				 &dst_it, src_L0, dst);
972 
973 		if (copy_system_ccs)
974 			emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src);
975 
976 		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
977 		update_idx = bb->len;
978 
979 		if (!copy_only_ccs)
980 			emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, src_L0, XE_PAGE_SIZE);
981 
982 		if (needs_ccs_emit)
983 			flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs,
984 							  IS_DGFX(xe) ? src_is_vram : src_is_pltt,
985 							  dst_L0_ofs,
986 							  IS_DGFX(xe) ? dst_is_vram : dst_is_pltt,
987 							  src_L0, ccs_ofs, copy_ccs);
988 
989 		job = xe_bb_create_migration_job(m->q, bb,
990 						 xe_migrate_batch_base(m, usm),
991 						 update_idx);
992 		if (IS_ERR(job)) {
993 			err = PTR_ERR(job);
994 			goto err;
995 		}
996 
997 		xe_sched_job_add_migrate_flush(job, flush_flags | MI_INVALIDATE_TLB);
998 		if (!fence) {
999 			err = xe_sched_job_add_deps(job, src_bo->ttm.base.resv,
1000 						    DMA_RESV_USAGE_BOOKKEEP);
1001 			if (!err && src_bo->ttm.base.resv != dst_bo->ttm.base.resv)
1002 				err = xe_sched_job_add_deps(job, dst_bo->ttm.base.resv,
1003 							    DMA_RESV_USAGE_BOOKKEEP);
1004 			if (err)
1005 				goto err_job;
1006 		}
1007 
1008 		mutex_lock(&m->job_mutex);
1009 		xe_sched_job_arm(job);
1010 		dma_fence_put(fence);
1011 		fence = dma_fence_get(&job->drm.s_fence->finished);
1012 		xe_sched_job_push(job);
1013 
1014 		dma_fence_put(m->fence);
1015 		m->fence = dma_fence_get(fence);
1016 
1017 		mutex_unlock(&m->job_mutex);
1018 
1019 		xe_bb_free(bb, fence);
1020 		size -= src_L0;
1021 		continue;
1022 
1023 err_job:
1024 		xe_sched_job_put(job);
1025 err:
1026 		xe_bb_free(bb, NULL);
1027 
1028 err_sync:
1029 		/* Sync partial copy if any. FIXME: under job_mutex? */
1030 		if (fence) {
1031 			dma_fence_wait(fence, false);
1032 			dma_fence_put(fence);
1033 		}
1034 
1035 		return ERR_PTR(err);
1036 	}
1037 
1038 	return fence;
1039 }
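
/*
 * Editor's usage sketch (error handling trimmed, not the authoritative call
 * site): a BO move hands both placements of the same BO to the migrate
 * context and synchronizes on the returned fence before the old placement is
 * released:
 *
 *	fence = xe_migrate_copy(m, bo, bo, old_mem, new_mem, false);
 *	if (IS_ERR(fence))
 *		return PTR_ERR(fence);
 *	dma_fence_wait(fence, false);
 *	dma_fence_put(fence);
 */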
1040 
1041 /**
1042  * xe_migrate_lrc() - Get the LRC from migrate context.
1043  * @migrate: Migrate context.
1044  *
1045  * Return: Pointer to LRC on success, error on failure
1046  */
1047 struct xe_lrc *xe_migrate_lrc(struct xe_migrate *migrate)
1048 {
1049 	return migrate->q->lrc[0];
1050 }
1051 
1052 static u64 migrate_vm_ppgtt_addr_tlb_inval(void)
1053 {
1054 	/*
1055 	 * The migrate VM is self-referential so it can modify its own PTEs (see
1056 	 * pte_update_size() or emit_pte() functions). We reserve NUM_KERNEL_PDE
1057 	 * entries for kernel operations (copies, clears, CCS migrate), and
1058 	 * suballocate the rest to user operations (binds/unbinds). With
1059 	 * NUM_KERNEL_PDE = 15, NUM_KERNEL_PDE - 1 is already used for PTE updates,
1060 	 * so assign NUM_KERNEL_PDE - 2 for TLB invalidation.
1061 	 */
1062 	return (NUM_KERNEL_PDE - 2) * XE_PAGE_SIZE;
1063 }
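
/*
 * Editor's note: with NUM_KERNEL_PDE == 15 and 4 KiB pages this evaluates to
 * 13 * 0x1000 == 0xd000, i.e. page index 13 of the self-referencing range.
 */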
1064 
1065 static int emit_flush_invalidate(u32 *dw, int i, u32 flags)
1066 {
1067 	u64 addr = migrate_vm_ppgtt_addr_tlb_inval();
1068 
1069 	dw[i++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW |
1070 		  MI_FLUSH_IMM_DW | flags;
1071 	dw[i++] = lower_32_bits(addr);
1072 	dw[i++] = upper_32_bits(addr);
1073 	dw[i++] = MI_NOOP;
1074 	dw[i++] = MI_NOOP;
1075 
1076 	return i;
1077 }
1078 
1079 /**
1080  * xe_migrate_ccs_rw_copy() - Build CCS copy commands for a buffer object.
1081  * @tile: Tile whose migration context is to be used.
1082  * @q: Exec queue to be used along with the migration context.
1083  * @src_bo: The source buffer object.
1084  * @read_write: Whether to create BB commands for a CCS read or a CCS write.
1085  *
1086  * Creates batch buffer instructions to copy CCS metadata from CCS pool to
1087  * memory and vice versa.
1088  *
1089  * This function should only be called for IGPU.
1090  *
1091  * Return: 0 if successful, negative error code on failure.
1092  */
1093 int xe_migrate_ccs_rw_copy(struct xe_tile *tile, struct xe_exec_queue *q,
1094 			   struct xe_bo *src_bo,
1095 			   enum xe_sriov_vf_ccs_rw_ctxs read_write)
1096 
1097 {
1098 	bool src_is_pltt = read_write == XE_SRIOV_VF_CCS_READ_CTX;
1099 	bool dst_is_pltt = read_write == XE_SRIOV_VF_CCS_WRITE_CTX;
1100 	struct ttm_resource *src = src_bo->ttm.resource;
1101 	struct xe_migrate *m = tile->migrate;
1102 	struct xe_gt *gt = tile->primary_gt;
1103 	u32 batch_size, batch_size_allocated;
1104 	struct xe_device *xe = gt_to_xe(gt);
1105 	struct xe_res_cursor src_it, ccs_it;
1106 	u64 size = xe_bo_size(src_bo);
1107 	struct xe_bb *bb = NULL;
1108 	u64 src_L0, src_L0_ofs;
1109 	u32 src_L0_pt;
1110 	int err;
1111 
1112 	xe_res_first_sg(xe_bo_sg(src_bo), 0, size, &src_it);
1113 
1114 	xe_res_first_sg(xe_bo_sg(src_bo), xe_bo_ccs_pages_start(src_bo),
1115 			PAGE_ALIGN(xe_device_ccs_bytes(xe, size)),
1116 			&ccs_it);
1117 
1118 	/* Calculate Batch buffer size */
1119 	batch_size = 0;
1120 	while (size) {
1121 		batch_size += 10; /* Flush + ggtt addr + 2 NOP */
1122 		u64 ccs_ofs, ccs_size;
1123 		u32 ccs_pt;
1124 
1125 		u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;
1126 
1127 		src_L0 = min_t(u64, max_mem_transfer_per_pass(xe), size);
1128 
1129 		batch_size += pte_update_size(m, false, src, &src_it, &src_L0,
1130 					      &src_L0_ofs, &src_L0_pt, 0, 0,
1131 					      avail_pts);
1132 
1133 		ccs_size = xe_device_ccs_bytes(xe, src_L0);
1134 		batch_size += pte_update_size(m, 0, NULL, &ccs_it, &ccs_size, &ccs_ofs,
1135 					      &ccs_pt, 0, avail_pts, avail_pts);
1136 		xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE));
1137 
1138 		/* Add copy commands size here */
1139 		batch_size += EMIT_COPY_CCS_DW;
1140 
1141 		size -= src_L0;
1142 	}
1143 
1144 	bb = xe_bb_ccs_new(gt, batch_size, read_write);
1145 	if (IS_ERR(bb)) {
1146 		drm_err(&xe->drm, "BB allocation failed.\n");
1147 		err = PTR_ERR(bb);
1148 		goto err_ret;
1149 	}
1150 
1151 	batch_size_allocated = batch_size;
1152 	size = xe_bo_size(src_bo);
1153 	batch_size = 0;
1154 
1155 	/*
1156 	 * Emit PTE and copy commands here.
1157 	 * The CCS copy command can only support limited size. If the size to be
1158 	 * copied is more than the limit, divide copy into chunks. So, calculate
1159 	 * sizes here again before copy command is emitted.
1160 	 */
1161 	while (size) {
1162 		batch_size += 10; /* Flush + ggtt addr + 2 NOP */
1163 		u32 flush_flags = 0;
1164 		u64 ccs_ofs, ccs_size;
1165 		u32 ccs_pt;
1166 
1167 		u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;
1168 
1169 		src_L0 = xe_migrate_res_sizes(m, &src_it);
1170 
1171 		batch_size += pte_update_size(m, false, src, &src_it, &src_L0,
1172 					      &src_L0_ofs, &src_L0_pt, 0, 0,
1173 					      avail_pts);
1174 
1175 		ccs_size = xe_device_ccs_bytes(xe, src_L0);
1176 		batch_size += pte_update_size(m, 0, NULL, &ccs_it, &ccs_size, &ccs_ofs,
1177 					      &ccs_pt, 0, avail_pts, avail_pts);
1178 		xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE));
1179 		batch_size += EMIT_COPY_CCS_DW;
1180 
1181 		emit_pte(m, bb, src_L0_pt, false, true, &src_it, src_L0, src);
1182 
1183 		emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src);
1184 
1185 		bb->len = emit_flush_invalidate(bb->cs, bb->len, flush_flags);
1186 		flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs, src_is_pltt,
1187 						  src_L0_ofs, dst_is_pltt,
1188 						  src_L0, ccs_ofs, true);
1189 		bb->len = emit_flush_invalidate(bb->cs, bb->len, flush_flags);
1190 
1191 		size -= src_L0;
1192 	}
1193 
1194 	xe_assert(xe, (batch_size_allocated == bb->len));
1195 	src_bo->bb_ccs[read_write] = bb;
1196 
1197 	return 0;
1198 
1199 err_ret:
1200 	return err;
1201 }
1202 
1203 /**
1204  * xe_migrate_exec_queue() - Get the execution queue from migrate context.
1205  * @migrate: Migrate context.
1206  *
1207  * Return: Pointer to execution queue on success, error on failure
1208  */
1209 struct xe_exec_queue *xe_migrate_exec_queue(struct xe_migrate *migrate)
1210 {
1211 	return migrate->q;
1212 }
1213 
1214 /**
1215  * xe_migrate_vram_copy_chunk() - Copy a chunk of a VRAM buffer object.
1216  * @vram_bo: The VRAM buffer object.
1217  * @vram_offset: The VRAM offset.
1218  * @sysmem_bo: The sysmem buffer object.
1219  * @sysmem_offset: The sysmem offset.
1220  * @size: The size of VRAM chunk to copy.
1221  * @dir: The direction of the copy operation.
1222  *
1223  * Copies a portion of a buffer object between VRAM and system memory.
1224  * On Xe2 platforms that support flat CCS, VRAM data is decompressed when
1225  * copying to system memory.
1226  *
1227  * Return: Pointer to a dma_fence representing the last copy batch, or
1228  * an error pointer on failure. If there is a failure, any copy operation
1229  * started by the function call has been synced.
1230  */
1231 struct dma_fence *xe_migrate_vram_copy_chunk(struct xe_bo *vram_bo, u64 vram_offset,
1232 					     struct xe_bo *sysmem_bo, u64 sysmem_offset,
1233 					     u64 size, enum xe_migrate_copy_dir dir)
1234 {
1235 	struct xe_device *xe = xe_bo_device(vram_bo);
1236 	struct xe_tile *tile = vram_bo->tile;
1237 	struct xe_gt *gt = tile->primary_gt;
1238 	struct xe_migrate *m = tile->migrate;
1239 	struct dma_fence *fence = NULL;
1240 	struct ttm_resource *vram = vram_bo->ttm.resource;
1241 	struct ttm_resource *sysmem = sysmem_bo->ttm.resource;
1242 	struct xe_res_cursor vram_it, sysmem_it;
1243 	u64 vram_L0_ofs, sysmem_L0_ofs;
1244 	u32 vram_L0_pt, sysmem_L0_pt;
1245 	u64 vram_L0, sysmem_L0;
1246 	bool to_sysmem = (dir == XE_MIGRATE_COPY_TO_SRAM);
1247 	bool use_comp_pat = to_sysmem &&
1248 		GRAPHICS_VER(xe) >= 20 && xe_device_has_flat_ccs(xe);
1249 	int pass = 0;
1250 	int err;
1251 
1252 	xe_assert(xe, IS_ALIGNED(vram_offset | sysmem_offset | size, PAGE_SIZE));
1253 	xe_assert(xe, xe_bo_is_vram(vram_bo));
1254 	xe_assert(xe, !xe_bo_is_vram(sysmem_bo));
1255 	xe_assert(xe, !range_overflows(vram_offset, size, (u64)vram_bo->ttm.base.size));
1256 	xe_assert(xe, !range_overflows(sysmem_offset, size, (u64)sysmem_bo->ttm.base.size));
1257 
1258 	xe_res_first(vram, vram_offset, size, &vram_it);
1259 	xe_res_first_sg(xe_bo_sg(sysmem_bo), sysmem_offset, size, &sysmem_it);
1260 
1261 	while (size) {
1262 		u32 pte_flags = PTE_UPDATE_FLAG_IS_VRAM;
1263 		u32 batch_size = 2; /* arb_clear() + MI_BATCH_BUFFER_END */
1264 		struct xe_sched_job *job;
1265 		struct xe_bb *bb;
1266 		u32 update_idx;
1267 		bool usm = xe->info.has_usm;
1268 		u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;
1269 
1270 		sysmem_L0 = xe_migrate_res_sizes(m, &sysmem_it);
1271 		vram_L0 = min(xe_migrate_res_sizes(m, &vram_it), sysmem_L0);
1272 
1273 		xe_dbg(xe, "Pass %u, size: %llu\n", pass++, vram_L0);
1274 
1275 		pte_flags |= use_comp_pat ? PTE_UPDATE_FLAG_IS_COMP_PTE : 0;
1276 		batch_size += pte_update_size(m, pte_flags, vram, &vram_it, &vram_L0,
1277 					      &vram_L0_ofs, &vram_L0_pt, 0, 0, avail_pts);
1278 
1279 		batch_size += pte_update_size(m, 0, sysmem, &sysmem_it, &vram_L0, &sysmem_L0_ofs,
1280 					      &sysmem_L0_pt, 0, avail_pts, avail_pts);
1281 		batch_size += EMIT_COPY_DW;
1282 
1283 		bb = xe_bb_new(gt, batch_size, usm);
1284 		if (IS_ERR(bb)) {
1285 			err = PTR_ERR(bb);
1286 			return ERR_PTR(err);
1287 		}
1288 
1289 		if (xe_migrate_allow_identity(vram_L0, &vram_it))
1290 			xe_res_next(&vram_it, vram_L0);
1291 		else
1292 			emit_pte(m, bb, vram_L0_pt, true, use_comp_pat, &vram_it, vram_L0, vram);
1293 
1294 		emit_pte(m, bb, sysmem_L0_pt, false, false, &sysmem_it, vram_L0, sysmem);
1295 
1296 		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
1297 		update_idx = bb->len;
1298 
1299 		if (to_sysmem)
1300 			emit_copy(gt, bb, vram_L0_ofs, sysmem_L0_ofs, vram_L0, XE_PAGE_SIZE);
1301 		else
1302 			emit_copy(gt, bb, sysmem_L0_ofs, vram_L0_ofs, vram_L0, XE_PAGE_SIZE);
1303 
1304 		job = xe_bb_create_migration_job(m->q, bb, xe_migrate_batch_base(m, usm),
1305 						 update_idx);
1306 		if (IS_ERR(job)) {
1307 			xe_bb_free(bb, NULL);
1308 			err = PTR_ERR(job);
1309 			return ERR_PTR(err);
1310 		}
1311 
1312 		xe_sched_job_add_migrate_flush(job, MI_INVALIDATE_TLB);
1313 
1314 		xe_assert(xe, dma_resv_test_signaled(vram_bo->ttm.base.resv,
1315 						     DMA_RESV_USAGE_BOOKKEEP));
1316 		xe_assert(xe, dma_resv_test_signaled(sysmem_bo->ttm.base.resv,
1317 						     DMA_RESV_USAGE_BOOKKEEP));
1318 
1319 		scoped_guard(mutex, &m->job_mutex) {
1320 			xe_sched_job_arm(job);
1321 			dma_fence_put(fence);
1322 			fence = dma_fence_get(&job->drm.s_fence->finished);
1323 			xe_sched_job_push(job);
1324 
1325 			dma_fence_put(m->fence);
1326 			m->fence = dma_fence_get(fence);
1327 		}
1328 
1329 		xe_bb_free(bb, fence);
1330 		size -= vram_L0;
1331 	}
1332 
1333 	return fence;
1334 }
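
/*
 * Editor's usage sketch (hypothetical offsets): evicting the first 2M of a
 * VRAM BO into a sysmem staging BO:
 *
 *	fence = xe_migrate_vram_copy_chunk(vram_bo, 0, sysmem_bo, 0, SZ_2M,
 *					   XE_MIGRATE_COPY_TO_SRAM);
 *	if (IS_ERR(fence))
 *		return PTR_ERR(fence);
 *	dma_fence_wait(fence, false);
 *	dma_fence_put(fence);
 */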
1335 
1336 static void emit_clear_link_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
1337 				 u32 size, u32 pitch)
1338 {
1339 	struct xe_device *xe = gt_to_xe(gt);
1340 	u32 *cs = bb->cs + bb->len;
1341 	u32 len = PVC_MEM_SET_CMD_LEN_DW;
1342 
1343 	*cs++ = PVC_MEM_SET_CMD | PVC_MEM_SET_MATRIX | (len - 2);
1344 	*cs++ = pitch - 1;
1345 	*cs++ = (size / pitch) - 1;
1346 	*cs++ = pitch - 1;
1347 	*cs++ = lower_32_bits(src_ofs);
1348 	*cs++ = upper_32_bits(src_ofs);
1349 	if (GRAPHICS_VERx100(xe) >= 2000)
1350 		*cs++ = FIELD_PREP(XE2_MEM_SET_MOCS_INDEX_MASK, gt->mocs.uc_index);
1351 	else
1352 		*cs++ = FIELD_PREP(PVC_MEM_SET_MOCS_INDEX_MASK, gt->mocs.uc_index);
1353 
1354 	xe_gt_assert(gt, cs - bb->cs == len + bb->len);
1355 
1356 	bb->len += len;
1357 }
1358 
1359 static void emit_clear_main_copy(struct xe_gt *gt, struct xe_bb *bb,
1360 				 u64 src_ofs, u32 size, u32 pitch, bool is_vram)
1361 {
1362 	struct xe_device *xe = gt_to_xe(gt);
1363 	u32 *cs = bb->cs + bb->len;
1364 	u32 len = XY_FAST_COLOR_BLT_DW;
1365 
1366 	if (GRAPHICS_VERx100(xe) < 1250)
1367 		len = 11;
1368 
1369 	*cs++ = XY_FAST_COLOR_BLT_CMD | XY_FAST_COLOR_BLT_DEPTH_32 |
1370 		(len - 2);
1371 	if (GRAPHICS_VERx100(xe) >= 2000)
1372 		*cs++ = FIELD_PREP(XE2_XY_FAST_COLOR_BLT_MOCS_INDEX_MASK, gt->mocs.uc_index) |
1373 			(pitch - 1);
1374 	else
1375 		*cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, gt->mocs.uc_index) |
1376 			(pitch - 1);
1377 	*cs++ = 0;
1378 	*cs++ = (size / pitch) << 16 | pitch / 4;
1379 	*cs++ = lower_32_bits(src_ofs);
1380 	*cs++ = upper_32_bits(src_ofs);
1381 	*cs++ = (is_vram ? 0x0 : 0x1) <<  XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT;
1382 	*cs++ = 0;
1383 	*cs++ = 0;
1384 	*cs++ = 0;
1385 	*cs++ = 0;
1386 
1387 	if (len > 11) {
1388 		*cs++ = 0;
1389 		*cs++ = 0;
1390 		*cs++ = 0;
1391 		*cs++ = 0;
1392 		*cs++ = 0;
1393 	}
1394 
1395 	xe_gt_assert(gt, cs - bb->cs == len + bb->len);
1396 
1397 	bb->len += len;
1398 }
1399 
1400 static bool has_service_copy_support(struct xe_gt *gt)
1401 {
1402 	/*
1403 	 * What we care about is whether the architecture was designed with
1404 	 * service copy functionality (specifically the new MEM_SET / MEM_COPY
1405 	 * instructions) so check the architectural engine list rather than the
1406 	 * actual list since these instructions are usable on BCS0 even if
1407 	 * all of the actual service copy engines (BCS1-BCS8) have been fused
1408 	 * off.
1409 	 */
1410 	return gt->info.engine_mask & GENMASK(XE_HW_ENGINE_BCS8,
1411 					      XE_HW_ENGINE_BCS1);
1412 }
1413 
1414 static u32 emit_clear_cmd_len(struct xe_gt *gt)
1415 {
1416 	if (has_service_copy_support(gt))
1417 		return PVC_MEM_SET_CMD_LEN_DW;
1418 	else
1419 		return XY_FAST_COLOR_BLT_DW;
1420 }
1421 
1422 static void emit_clear(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
1423 		       u32 size, u32 pitch, bool is_vram)
1424 {
1425 	if (has_service_copy_support(gt))
1426 		emit_clear_link_copy(gt, bb, src_ofs, size, pitch);
1427 	else
1428 		emit_clear_main_copy(gt, bb, src_ofs, size, pitch,
1429 				     is_vram);
1430 }
1431 
1432 /**
1433  * xe_migrate_clear() - Clear content of a TTM resource.
1434  * @m: The migration context.
1435  * @bo: The buffer object @dst is currently bound to.
1436  * @dst: The dst TTM resource to be cleared.
1437  * @clear_flags: flags to specify which data to clear: CCS, BO, or both.
1438  *
1439  * Clear the contents of @dst to zero when XE_MIGRATE_CLEAR_FLAG_BO_DATA is set.
1440  * On flat CCS devices, the CCS metadata is cleared to zero with XE_MIGRATE_CLEAR_FLAG_CCS_DATA.
1441  * Set XE_MIGRATE_CLEAR_FLAG_FULL to clear bo as well as CCS metadata.
1442  * TODO: Eliminate the @bo argument.
1443  *
1444  * Return: Pointer to a dma_fence representing the last clear batch, or
1445  * an error pointer on failure. If there is a failure, any clear operation
1446  * started by the function call has been synced.
1447  */
1448 struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
1449 				   struct xe_bo *bo,
1450 				   struct ttm_resource *dst,
1451 				   u32 clear_flags)
1452 {
1453 	bool clear_vram = mem_type_is_vram(dst->mem_type);
1454 	bool clear_bo_data = XE_MIGRATE_CLEAR_FLAG_BO_DATA & clear_flags;
1455 	bool clear_ccs = XE_MIGRATE_CLEAR_FLAG_CCS_DATA & clear_flags;
1456 	struct xe_gt *gt = m->tile->primary_gt;
1457 	struct xe_device *xe = gt_to_xe(gt);
1458 	bool clear_only_system_ccs = false;
1459 	struct dma_fence *fence = NULL;
1460 	u64 size = xe_bo_size(bo);
1461 	struct xe_res_cursor src_it;
1462 	struct ttm_resource *src = dst;
1463 	int err;
1464 
1465 	if (WARN_ON(!clear_bo_data && !clear_ccs))
1466 		return NULL;
1467 
1468 	if (!clear_bo_data && clear_ccs && !IS_DGFX(xe))
1469 		clear_only_system_ccs = true;
1470 
1471 	if (!clear_vram)
1472 		xe_res_first_sg(xe_bo_sg(bo), 0, xe_bo_size(bo), &src_it);
1473 	else
1474 		xe_res_first(src, 0, xe_bo_size(bo), &src_it);
1475 
1476 	while (size) {
1477 		u64 clear_L0_ofs;
1478 		u32 clear_L0_pt;
1479 		u32 flush_flags = 0;
1480 		u64 clear_L0;
1481 		struct xe_sched_job *job;
1482 		struct xe_bb *bb;
1483 		u32 batch_size, update_idx;
1484 		u32 pte_flags;
1485 
1486 		bool usm = xe->info.has_usm;
1487 		u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;
1488 
1489 		clear_L0 = xe_migrate_res_sizes(m, &src_it);
1490 
1491 		/* Calculate final sizes and batch size.. */
1492 		pte_flags = clear_vram ? PTE_UPDATE_FLAG_IS_VRAM : 0;
1493 		batch_size = 1 +
1494 			pte_update_size(m, pte_flags, src, &src_it,
1495 					&clear_L0, &clear_L0_ofs, &clear_L0_pt,
1496 					clear_bo_data ? emit_clear_cmd_len(gt) : 0, 0,
1497 					avail_pts);
1498 
1499 		if (xe_migrate_needs_ccs_emit(xe))
1500 			batch_size += EMIT_COPY_CCS_DW;
1501 
1502 		/* Clear commands */
1503 
1504 		if (WARN_ON_ONCE(!clear_L0))
1505 			break;
1506 
1507 		bb = xe_bb_new(gt, batch_size, usm);
1508 		if (IS_ERR(bb)) {
1509 			err = PTR_ERR(bb);
1510 			goto err_sync;
1511 		}
1512 
1513 		size -= clear_L0;
1514 		/* Preemption is enabled again by the ring ops. */
1515 		if (clear_vram && xe_migrate_allow_identity(clear_L0, &src_it)) {
1516 			xe_res_next(&src_it, clear_L0);
1517 		} else {
1518 			emit_pte(m, bb, clear_L0_pt, clear_vram,
1519 				 clear_only_system_ccs, &src_it, clear_L0, dst);
1520 			flush_flags |= MI_INVALIDATE_TLB;
1521 		}
1522 
1523 		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
1524 		update_idx = bb->len;
1525 
1526 		if (clear_bo_data)
1527 			emit_clear(gt, bb, clear_L0_ofs, clear_L0, XE_PAGE_SIZE, clear_vram);
1528 
1529 		if (xe_migrate_needs_ccs_emit(xe)) {
1530 			emit_copy_ccs(gt, bb, clear_L0_ofs, true,
1531 				      m->cleared_mem_ofs, false, clear_L0);
1532 			flush_flags |= MI_FLUSH_DW_CCS;
1533 		}
1534 
1535 		job = xe_bb_create_migration_job(m->q, bb,
1536 						 xe_migrate_batch_base(m, usm),
1537 						 update_idx);
1538 		if (IS_ERR(job)) {
1539 			err = PTR_ERR(job);
1540 			goto err;
1541 		}
1542 
1543 		xe_sched_job_add_migrate_flush(job, flush_flags);
1544 		if (!fence) {
1545 			/*
1546 			 * There can't be anything userspace related at this
1547 			 * point, so we just need to respect any potential move
1548 			 * fences, which are always tracked as
1549 			 * DMA_RESV_USAGE_KERNEL.
1550 			 */
1551 			err = xe_sched_job_add_deps(job, bo->ttm.base.resv,
1552 						    DMA_RESV_USAGE_KERNEL);
1553 			if (err)
1554 				goto err_job;
1555 		}
1556 
1557 		mutex_lock(&m->job_mutex);
1558 		xe_sched_job_arm(job);
1559 		dma_fence_put(fence);
1560 		fence = dma_fence_get(&job->drm.s_fence->finished);
1561 		xe_sched_job_push(job);
1562 
1563 		dma_fence_put(m->fence);
1564 		m->fence = dma_fence_get(fence);
1565 
1566 		mutex_unlock(&m->job_mutex);
1567 
1568 		xe_bb_free(bb, fence);
1569 		continue;
1570 
1571 err_job:
1572 		xe_sched_job_put(job);
1573 err:
1574 		xe_bb_free(bb, NULL);
1575 err_sync:
1576 		/* Sync partial copies if any. FIXME: job_mutex? */
1577 		if (fence) {
1578 			dma_fence_wait(fence, false);
1579 			dma_fence_put(fence);
1580 		}
1581 
1582 		return ERR_PTR(err);
1583 	}
1584 
1585 	if (clear_ccs)
1586 		bo->ccs_cleared = true;
1587 
1588 	return fence;
1589 }
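
/*
 * Editor's usage sketch: clearing both the data and the CCS metadata of a
 * freshly allocated VRAM BO:
 *
 *	fence = xe_migrate_clear(m, bo, bo->ttm.resource,
 *				 XE_MIGRATE_CLEAR_FLAG_FULL);
 *	if (IS_ERR(fence))
 *		return PTR_ERR(fence);
 *	dma_fence_wait(fence, false);
 *	dma_fence_put(fence);
 */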
1590 
1591 static void write_pgtable(struct xe_tile *tile, struct xe_bb *bb, u64 ppgtt_ofs,
1592 			  const struct xe_vm_pgtable_update_op *pt_op,
1593 			  const struct xe_vm_pgtable_update *update,
1594 			  struct xe_migrate_pt_update *pt_update)
1595 {
1596 	const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
1597 	u32 chunk;
1598 	u32 ofs = update->ofs, size = update->qwords;
1599 
1600 	/*
1601 	 * If we have 512 entries (the maximum), we would have to populate
1602 	 * the page table ourselves and update the PDE above it to the new
1603 	 * pointer. The only time this can happen is if we have to update
1604 	 * the top-level PDE, which requires a BO that is almost vm->size big.
1605 	 *
1606 	 * This shouldn't be possible in practice; it might change when 16K
1607 	 * pages are used. Hence the assert.
1608 	 */
1609 	xe_tile_assert(tile, update->qwords < MAX_NUM_PTE);
1610 	if (!ppgtt_ofs)
1611 		ppgtt_ofs = xe_migrate_vram_ofs(tile_to_xe(tile),
1612 						xe_bo_addr(update->pt_bo, 0,
1613 							   XE_PAGE_SIZE), false);
1614 
1615 	do {
1616 		u64 addr = ppgtt_ofs + ofs * 8;
1617 
1618 		chunk = min(size, MAX_PTE_PER_SDI);
1619 
1620 		/* Qword-align the SDI payload so the populate/clear callbacks can use memset64() */
1621 		if (!(bb->len & 1))
1622 			bb->cs[bb->len++] = MI_NOOP;
1623 
1624 		bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
1625 		bb->cs[bb->len++] = lower_32_bits(addr);
1626 		bb->cs[bb->len++] = upper_32_bits(addr);
1627 		if (pt_op->bind)
1628 			ops->populate(pt_update, tile, NULL, bb->cs + bb->len,
1629 				      ofs, chunk, update);
1630 		else
1631 			ops->clear(pt_update, tile, NULL, bb->cs + bb->len,
1632 				   ofs, chunk, update);
1633 
1634 		bb->len += chunk * 2;
1635 		ofs += chunk;
1636 		size -= chunk;
1637 	} while (size);
1638 }
1639 
xe_migrate_get_vm(struct xe_migrate * m)1640 struct xe_vm *xe_migrate_get_vm(struct xe_migrate *m)
1641 {
1642 	return xe_vm_get(m->q->vm);
1643 }
1644 
1645 #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
1646 struct migrate_test_params {
1647 	struct xe_test_priv base;
1648 	bool force_gpu;
1649 };
1650 
1651 #define to_migrate_test_params(_priv) \
1652 	container_of(_priv, struct migrate_test_params, base)
1653 #endif
1654 
1655 static struct dma_fence *
xe_migrate_update_pgtables_cpu(struct xe_migrate * m,struct xe_migrate_pt_update * pt_update)1656 xe_migrate_update_pgtables_cpu(struct xe_migrate *m,
1657 			       struct xe_migrate_pt_update *pt_update)
1658 {
1659 	XE_TEST_DECLARE(struct migrate_test_params *test =
1660 			to_migrate_test_params
1661 			(xe_cur_kunit_priv(XE_TEST_LIVE_MIGRATE));)
1662 	const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
1663 	struct xe_vm *vm = pt_update->vops->vm;
1664 	struct xe_vm_pgtable_update_ops *pt_update_ops =
1665 		&pt_update->vops->pt_update_ops[pt_update->tile_id];
1666 	int err;
1667 	u32 i, j;
1668 
1669 	if (XE_TEST_ONLY(test && test->force_gpu))
1670 		return ERR_PTR(-ETIME);
1671 
1672 	if (ops->pre_commit) {
1673 		pt_update->job = NULL;
1674 		err = ops->pre_commit(pt_update);
1675 		if (err)
1676 			return ERR_PTR(err);
1677 	}
1678 
1679 	for (i = 0; i < pt_update_ops->num_ops; ++i) {
1680 		const struct xe_vm_pgtable_update_op *pt_op =
1681 			&pt_update_ops->ops[i];
1682 
1683 		for (j = 0; j < pt_op->num_entries; j++) {
1684 			const struct xe_vm_pgtable_update *update =
1685 				&pt_op->entries[j];
1686 
1687 			if (pt_op->bind)
1688 				ops->populate(pt_update, m->tile,
1689 					      &update->pt_bo->vmap, NULL,
1690 					      update->ofs, update->qwords,
1691 					      update);
1692 			else
1693 				ops->clear(pt_update, m->tile,
1694 					   &update->pt_bo->vmap, NULL,
1695 					   update->ofs, update->qwords, update);
1696 		}
1697 	}
1698 
1699 	trace_xe_vm_cpu_bind(vm);
1700 	xe_device_wmb(vm->xe);
1701 
1702 	return dma_fence_get_stub();
1703 }
1704 
1705 static struct dma_fence *
__xe_migrate_update_pgtables(struct xe_migrate * m,struct xe_migrate_pt_update * pt_update,struct xe_vm_pgtable_update_ops * pt_update_ops)1706 __xe_migrate_update_pgtables(struct xe_migrate *m,
1707 			     struct xe_migrate_pt_update *pt_update,
1708 			     struct xe_vm_pgtable_update_ops *pt_update_ops)
1709 {
1710 	const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
1711 	struct xe_tile *tile = m->tile;
1712 	struct xe_gt *gt = tile->primary_gt;
1713 	struct xe_device *xe = tile_to_xe(tile);
1714 	struct xe_sched_job *job;
1715 	struct dma_fence *fence;
1716 	struct drm_suballoc *sa_bo = NULL;
1717 	struct xe_bb *bb;
1718 	u32 i, j, batch_size = 0, ppgtt_ofs, update_idx, page_ofs = 0;
1719 	u32 num_updates = 0, current_update = 0;
1720 	u64 addr;
1721 	int err = 0;
1722 	bool is_migrate = pt_update_ops->q == m->q;
1723 	bool usm = is_migrate && xe->info.has_usm;
1724 
1725 	for (i = 0; i < pt_update_ops->num_ops; ++i) {
1726 		struct xe_vm_pgtable_update_op *pt_op = &pt_update_ops->ops[i];
1727 		struct xe_vm_pgtable_update *updates = pt_op->entries;
1728 
1729 		num_updates += pt_op->num_entries;
1730 		for (j = 0; j < pt_op->num_entries; ++j) {
1731 			u32 num_cmds = DIV_ROUND_UP(updates[j].qwords,
1732 						    MAX_PTE_PER_SDI);
1733 
1734 			/* 1 MI_NOOP for alignment + 3-dword MI_STORE_DATA_IMM cmd prefix */
1735 			batch_size += 4 * num_cmds + updates[j].qwords * 2;
1736 		}
1737 	}
1738 
1739 	/* fixed + PTE entries */
1740 	if (IS_DGFX(xe))
1741 		batch_size += 2;
1742 	else
1743 		batch_size += 6 * (num_updates / MAX_PTE_PER_SDI + 1) +
1744 			num_updates * 2;
1745 
1746 	bb = xe_bb_new(gt, batch_size, usm);
1747 	if (IS_ERR(bb))
1748 		return ERR_CAST(bb);
1749 
1750 	/* For sysmem PTEs, we need to map them into our hole. */
1751 	if (!IS_DGFX(xe)) {
1752 		u16 pat_index = xe->pat.idx[XE_CACHE_WB];
1753 		u32 ptes, ofs;
1754 
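		/*
		 * For the default migrate queue the PT mappings below land in
		 * a reserved kernel page-table page; other queues suballocate
		 * their slots from vm_update_sa.
		 */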
1755 		ppgtt_ofs = NUM_KERNEL_PDE - 1;
1756 		if (!is_migrate) {
1757 			u32 num_units = DIV_ROUND_UP(num_updates,
1758 						     NUM_VMUSA_WRITES_PER_UNIT);
1759 
1760 			if (num_units > m->vm_update_sa.size) {
1761 				err = -ENOBUFS;
1762 				goto err_bb;
1763 			}
1764 			sa_bo = drm_suballoc_new(&m->vm_update_sa, num_units,
1765 						 GFP_KERNEL, true, 0);
1766 			if (IS_ERR(sa_bo)) {
1767 				err = PTR_ERR(sa_bo);
1768 				goto err_bb;
1769 			}
1770 
1771 			ppgtt_ofs = NUM_KERNEL_PDE +
1772 				(drm_suballoc_soffset(sa_bo) /
1773 				 NUM_VMUSA_UNIT_PER_PAGE);
1774 			page_ofs = (drm_suballoc_soffset(sa_bo) %
1775 				    NUM_VMUSA_UNIT_PER_PAGE) *
1776 				VM_SA_UPDATE_UNIT_SIZE;
1777 		}
1778 
1779 		/* Map our PTs into the GTT */
1780 		i = 0;
1781 		j = 0;
1782 		ptes = num_updates;
1783 		ofs = ppgtt_ofs * XE_PAGE_SIZE + page_ofs;
1784 		while (ptes) {
1785 			u32 chunk = min(MAX_PTE_PER_SDI, ptes);
1786 			u32 idx = 0;
1787 
1788 			bb->cs[bb->len++] = MI_STORE_DATA_IMM |
1789 				MI_SDI_NUM_QW(chunk);
1790 			bb->cs[bb->len++] = ofs;
1791 			bb->cs[bb->len++] = 0; /* upper_32_bits */
1792 
1793 			for (; i < pt_update_ops->num_ops; ++i) {
1794 				struct xe_vm_pgtable_update_op *pt_op =
1795 					&pt_update_ops->ops[i];
1796 				struct xe_vm_pgtable_update *updates = pt_op->entries;
1797 
1798 				for (; j < pt_op->num_entries; ++j, ++current_update, ++idx) {
1799 					struct xe_vm *vm = pt_update->vops->vm;
1800 					struct xe_bo *pt_bo = updates[j].pt_bo;
1801 
1802 					if (idx == chunk)
1803 						goto next_cmd;
1804 
1805 					xe_tile_assert(tile, xe_bo_size(pt_bo) == SZ_4K);
1806 
1807 					/* Map a PT at most once */
1808 					if (pt_bo->update_index < 0)
1809 						pt_bo->update_index = current_update;
1810 
1811 					addr = vm->pt_ops->pte_encode_bo(pt_bo, 0,
1812 									 pat_index, 0);
1813 					bb->cs[bb->len++] = lower_32_bits(addr);
1814 					bb->cs[bb->len++] = upper_32_bits(addr);
1815 				}
1816 
1817 				j = 0;
1818 			}
1819 
1820 next_cmd:
1821 			ptes -= chunk;
1822 			ofs += chunk * sizeof(u64);
1823 		}
1824 
1825 		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
1826 		update_idx = bb->len;
1827 
1828 		addr = xe_migrate_vm_addr(ppgtt_ofs, 0) +
1829 			(page_ofs / sizeof(u64)) * XE_PAGE_SIZE;
1830 		for (i = 0; i < pt_update_ops->num_ops; ++i) {
1831 			struct xe_vm_pgtable_update_op *pt_op =
1832 				&pt_update_ops->ops[i];
1833 			struct xe_vm_pgtable_update *updates = pt_op->entries;
1834 
1835 			for (j = 0; j < pt_op->num_entries; ++j) {
1836 				struct xe_bo *pt_bo = updates[j].pt_bo;
1837 
1838 				write_pgtable(tile, bb, addr +
1839 					      pt_bo->update_index * XE_PAGE_SIZE,
1840 					      pt_op, &updates[j], pt_update);
1841 			}
1842 		}
1843 	} else {
1844 		/* phys pages, no preamble required */
1845 		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
1846 		update_idx = bb->len;
1847 
1848 		for (i = 0; i < pt_update_ops->num_ops; ++i) {
1849 			struct xe_vm_pgtable_update_op *pt_op =
1850 				&pt_update_ops->ops[i];
1851 			struct xe_vm_pgtable_update *updates = pt_op->entries;
1852 
1853 			for (j = 0; j < pt_op->num_entries; ++j)
1854 				write_pgtable(tile, bb, 0, pt_op, &updates[j],
1855 					      pt_update);
1856 		}
1857 	}
1858 
1859 	job = xe_bb_create_migration_job(pt_update_ops->q, bb,
1860 					 xe_migrate_batch_base(m, usm),
1861 					 update_idx);
1862 	if (IS_ERR(job)) {
1863 		err = PTR_ERR(job);
1864 		goto err_sa;
1865 	}
1866 
1867 	xe_sched_job_add_migrate_flush(job, MI_INVALIDATE_TLB);
1868 
1869 	if (ops->pre_commit) {
1870 		pt_update->job = job;
1871 		err = ops->pre_commit(pt_update);
1872 		if (err)
1873 			goto err_job;
1874 	}
1875 	if (is_migrate)
1876 		mutex_lock(&m->job_mutex);
1877 
1878 	xe_sched_job_arm(job);
1879 	fence = dma_fence_get(&job->drm.s_fence->finished);
1880 	xe_sched_job_push(job);
1881 
1882 	if (is_migrate)
1883 		mutex_unlock(&m->job_mutex);
1884 
1885 	xe_bb_free(bb, fence);
1886 	drm_suballoc_free(sa_bo, fence);
1887 
1888 	return fence;
1889 
1890 err_job:
1891 	xe_sched_job_put(job);
1892 err_sa:
1893 	drm_suballoc_free(sa_bo, NULL);
1894 err_bb:
1895 	xe_bb_free(bb, NULL);
1896 	return ERR_PTR(err);
1897 }
1898 
1899 /**
1900  * xe_migrate_update_pgtables() - Pipelined page-table update
1901  * @m: The migrate context.
1902  * @pt_update: PT update arguments
1903  *
1904  * Perform a pipelined page-table update. The update descriptors are typically
1905  * built under the same lock critical section as a call to this function. If
1906  * using the default engine for the updates, they will be performed in the
1907  * order they grab the job_mutex. If different engines are used, external
1908  * synchronization is needed for overlapping updates to maintain page-table
1909  * consistency. Note that the meaning of "overlapping" is that the updates
1910  * touch the same page-table, which might be a higher-level page-directory.
1911  * If no pipelining is needed, then updates may be performed by the cpu.
1912  *
1913  * Return: A dma_fence that, when signaled, indicates the update completion.
1914  */
1915 struct dma_fence *
xe_migrate_update_pgtables(struct xe_migrate * m,struct xe_migrate_pt_update * pt_update)1916 xe_migrate_update_pgtables(struct xe_migrate *m,
1917 			   struct xe_migrate_pt_update *pt_update)
1918 
1919 {
1920 	struct xe_vm_pgtable_update_ops *pt_update_ops =
1921 		&pt_update->vops->pt_update_ops[pt_update->tile_id];
1922 	struct dma_fence *fence;
1923 
1924 	fence =  xe_migrate_update_pgtables_cpu(m, pt_update);
1925 
1926 	/* -ETIME indicates a job is needed, anything else is a legitimate error */
1927 	if (!IS_ERR(fence) || PTR_ERR(fence) != -ETIME)
1928 		return fence;
1929 
1930 	return __xe_migrate_update_pgtables(m, pt_update, pt_update_ops);
1931 }
1932 
1933 /**
1934  * xe_migrate_wait() - Complete all operations using the xe_migrate context
1935  * @m: Migrate context to wait for.
1936  *
1937  * Waits until the GPU no longer uses the migrate context's default engine
1938  * or its page-table objects. FIXME: What about separate page-table update
1939  * engines?
1940  */
xe_migrate_wait(struct xe_migrate * m)1941 void xe_migrate_wait(struct xe_migrate *m)
1942 {
1943 	if (m->fence)
1944 		dma_fence_wait(m->fence, false);
1945 }
1946 
pte_update_cmd_size(u64 size)1947 static u32 pte_update_cmd_size(u64 size)
1948 {
1949 	u32 num_dword;
1950 	u64 entries = DIV_U64_ROUND_UP(size, XE_PAGE_SIZE);
1951 
1952 	XE_WARN_ON(size > MAX_PREEMPTDISABLE_TRANSFER);
1953 
1954 	/*
1955 	 * The MI_STORE_DATA_IMM command is used to update the page table. Each
1956 	 * instruction can update at most MAX_PTE_PER_SDI PTE entries. To
1957 	 * update n (n <= MAX_PTE_PER_SDI) PTE entries, we need:
1958 	 *
1959 	 * - 1 dword for the MI_STORE_DATA_IMM command header (opcode, etc.)
1960 	 * - 2 dwords for the page table's physical location
1961 	 * - 2*n dwords for the PTE values to fill (each PTE entry is 2 dwords)
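	 *
	 * Worked example, assuming 4K XE_PAGE_SIZE: a full 8M
	 * (MAX_PREEMPTDISABLE_TRANSFER) update gives entries = 2048, which
	 * needs DIV_U64_ROUND_UP(2048, 510) = 5 MI_STORE_DATA_IMM commands, so
	 * num_dword = (1 + 2) * 5 + 2048 * 2 = 4111 dwords.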
1962 	 */
1963 	num_dword = (1 + 2) * DIV_U64_ROUND_UP(entries, MAX_PTE_PER_SDI);
1964 	num_dword += entries * 2;
1965 
1966 	return num_dword;
1967 }
1968 
build_pt_update_batch_sram(struct xe_migrate * m,struct xe_bb * bb,u32 pt_offset,struct drm_pagemap_addr * sram_addr,u32 size,int level)1969 static void build_pt_update_batch_sram(struct xe_migrate *m,
1970 				       struct xe_bb *bb, u32 pt_offset,
1971 				       struct drm_pagemap_addr *sram_addr,
1972 				       u32 size, int level)
1973 {
1974 	u16 pat_index = tile_to_xe(m->tile)->pat.idx[XE_CACHE_WB];
1975 	u64 gpu_page_size = 0x1ull << xe_pt_shift(level);
1976 	u32 ptes;
1977 	int i = 0;
1978 
1979 	xe_tile_assert(m->tile, PAGE_ALIGNED(size));
1980 
1981 	ptes = DIV_ROUND_UP(size, gpu_page_size);
1982 	while (ptes) {
1983 		u32 chunk = min(MAX_PTE_PER_SDI, ptes);
1984 
1985 		if (!level)
1986 			chunk = ALIGN_DOWN(chunk, PAGE_SIZE / XE_PAGE_SIZE);
1987 
1988 		bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
1989 		bb->cs[bb->len++] = pt_offset;
1990 		bb->cs[bb->len++] = 0;
1991 
1992 		pt_offset += chunk * 8;
1993 		ptes -= chunk;
1994 
1995 		while (chunk--) {
1996 			u64 addr = sram_addr[i].addr;
1997 			u64 pte;
1998 
1999 			xe_tile_assert(m->tile, sram_addr[i].proto ==
2000 				       DRM_INTERCONNECT_SYSTEM);
2001 			xe_tile_assert(m->tile, addr);
2002 			xe_tile_assert(m->tile, PAGE_ALIGNED(addr));
2003 
2004 again:
2005 			pte = m->q->vm->pt_ops->pte_encode_addr(m->tile->xe,
2006 								addr, pat_index,
2007 								level, false, 0);
2008 			bb->cs[bb->len++] = lower_32_bits(pte);
2009 			bb->cs[bb->len++] = upper_32_bits(pte);
2010 
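			/*
			 * When CPU pages are larger than XE_PAGE_SIZE (e.g.
			 * 64K kernel pages), emit one 4K GPU PTE for each
			 * XE_PAGE_SIZE chunk of the CPU page before advancing
			 * to the next sram_addr entry.
			 */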
2011 			if (gpu_page_size < PAGE_SIZE) {
2012 				addr += XE_PAGE_SIZE;
2013 				if (!PAGE_ALIGNED(addr)) {
2014 					chunk--;
2015 					goto again;
2016 				}
2017 				i++;
2018 			} else {
2019 				i += gpu_page_size / PAGE_SIZE;
2020 			}
2021 		}
2022 	}
2023 }
2024 
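/*
 * Returns true only if every 2M-aligned chunk of the range is backed by a
 * single large (2M) drm_pagemap_addr entry, in which case xe_migrate_vram()
 * can map the copy with 2M PDEs instead of 4K PTEs.
 */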
xe_migrate_vram_use_pde(struct drm_pagemap_addr * sram_addr,unsigned long size)2025 static bool xe_migrate_vram_use_pde(struct drm_pagemap_addr *sram_addr,
2026 				    unsigned long size)
2027 {
2028 	u32 large_size = (0x1 << xe_pt_shift(1));
2029 	unsigned long i, incr = large_size / PAGE_SIZE;
2030 
2031 	for (i = 0; i < DIV_ROUND_UP(size, PAGE_SIZE); i += incr)
2032 		if (PAGE_SIZE << sram_addr[i].order != large_size)
2033 			return false;
2034 
2035 	return true;
2036 }
2037 
2038 #define XE_CACHELINE_BYTES	64ull
2039 #define XE_CACHELINE_MASK	(XE_CACHELINE_BYTES - 1)
2040 
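/*
 * Pick the largest pitch that evenly divides the copy length; a byte-granular
 * pitch of 1 is only usable on platforms with the MEM_COPY instruction.
 */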
xe_migrate_copy_pitch(struct xe_device * xe,u32 len)2041 static u32 xe_migrate_copy_pitch(struct xe_device *xe, u32 len)
2042 {
2043 	u32 pitch;
2044 
2045 	if (IS_ALIGNED(len, PAGE_SIZE))
2046 		pitch = PAGE_SIZE;
2047 	else if (IS_ALIGNED(len, SZ_4K))
2048 		pitch = SZ_4K;
2049 	else if (IS_ALIGNED(len, SZ_256))
2050 		pitch = SZ_256;
2051 	else if (IS_ALIGNED(len, 4))
2052 		pitch = 4;
2053 	else
2054 		pitch = 1;
2055 
2056 	xe_assert(xe, pitch > 1 || xe->info.has_mem_copy_instr);
2057 	return pitch;
2058 }
2059 
xe_migrate_vram(struct xe_migrate * m,unsigned long len,unsigned long sram_offset,struct drm_pagemap_addr * sram_addr,u64 vram_addr,const enum xe_migrate_copy_dir dir)2060 static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
2061 					 unsigned long len,
2062 					 unsigned long sram_offset,
2063 					 struct drm_pagemap_addr *sram_addr,
2064 					 u64 vram_addr,
2065 					 const enum xe_migrate_copy_dir dir)
2066 {
2067 	struct xe_gt *gt = m->tile->primary_gt;
2068 	struct xe_device *xe = gt_to_xe(gt);
2069 	bool use_usm_batch = xe->info.has_usm;
2070 	struct dma_fence *fence = NULL;
2071 	u32 batch_size = 1;
2072 	u64 src_L0_ofs, dst_L0_ofs;
2073 	struct xe_sched_job *job;
2074 	struct xe_bb *bb;
2075 	u32 update_idx, pt_slot = 0;
2076 	unsigned long npages = DIV_ROUND_UP(len + sram_offset, PAGE_SIZE);
2077 	unsigned int pitch = xe_migrate_copy_pitch(xe, len);
2078 	int err;
2079 	unsigned long i, j;
2080 	bool use_pde = xe_migrate_vram_use_pde(sram_addr, len + sram_offset);
2081 
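	/*
	 * Without the MEM_COPY instruction, the length must be pitch-aligned
	 * and the sysmem offset and VRAM address must be cacheline-aligned.
	 */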
2082 	if (!xe->info.has_mem_copy_instr &&
2083 	    drm_WARN_ON(&xe->drm,
2084 			(!IS_ALIGNED(len, pitch)) || (sram_offset | vram_addr) & XE_CACHELINE_MASK))
2085 		return ERR_PTR(-EOPNOTSUPP);
2086 
2087 	xe_assert(xe, npages * PAGE_SIZE <= MAX_PREEMPTDISABLE_TRANSFER);
2088 
2089 	batch_size += pte_update_cmd_size(npages << PAGE_SHIFT);
2090 	batch_size += EMIT_COPY_DW;
2091 
2092 	bb = xe_bb_new(gt, batch_size, use_usm_batch);
2093 	if (IS_ERR(bb)) {
2094 		err = PTR_ERR(bb);
2095 		return ERR_PTR(err);
2096 	}
2097 
2098 	/*
2099 	 * If the order of a struct drm_pagemap_addr entry is greater than 0,
2100 	 * the entry is populated by the GPU pagemap, but subsequent entries
2101 	 * within the range of that order are not populated.
2102 	 * build_pt_update_batch_sram() expects a fully populated array of
2103 	 * struct drm_pagemap_addr. Ensure this is the case even with higher
2104 	 * orders.
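	 *
	 * For example, an order-2 entry (four pages, assuming NR_PAGES(order)
	 * is 1 << order) leaves the next three entries unset; they are filled
	 * in below with addr + PAGE_SIZE, addr + 2 * PAGE_SIZE and
	 * addr + 3 * PAGE_SIZE.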
2105 	 */
2106 	for (i = 0; !use_pde && i < npages;) {
2107 		unsigned int order = sram_addr[i].order;
2108 
2109 		for (j = 1; j < NR_PAGES(order) && i + j < npages; j++)
2110 			if (!sram_addr[i + j].addr)
2111 				sram_addr[i + j].addr = sram_addr[i].addr + j * PAGE_SIZE;
2112 
2113 		i += NR_PAGES(order);
2114 	}
2115 
2116 	if (use_pde)
2117 		build_pt_update_batch_sram(m, bb, m->large_page_copy_pdes,
2118 					   sram_addr, npages << PAGE_SHIFT, 1);
2119 	else
2120 		build_pt_update_batch_sram(m, bb, pt_slot * XE_PAGE_SIZE,
2121 					   sram_addr, npages << PAGE_SHIFT, 0);
2122 
2123 	if (dir == XE_MIGRATE_COPY_TO_VRAM) {
2124 		if (use_pde)
2125 			src_L0_ofs = m->large_page_copy_ofs + sram_offset;
2126 		else
2127 			src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
2128 		dst_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
2129 
2130 	} else {
2131 		src_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
2132 		if (use_pde)
2133 			dst_L0_ofs = m->large_page_copy_ofs + sram_offset;
2134 		else
2135 			dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
2136 	}
2137 
2138 	bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
2139 	update_idx = bb->len;
2140 
2141 	emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, len, pitch);
2142 
2143 	job = xe_bb_create_migration_job(m->q, bb,
2144 					 xe_migrate_batch_base(m, use_usm_batch),
2145 					 update_idx);
2146 	if (IS_ERR(job)) {
2147 		err = PTR_ERR(job);
2148 		goto err;
2149 	}
2150 
2151 	xe_sched_job_add_migrate_flush(job, MI_INVALIDATE_TLB);
2152 
2153 	mutex_lock(&m->job_mutex);
2154 	xe_sched_job_arm(job);
2155 	fence = dma_fence_get(&job->drm.s_fence->finished);
2156 	xe_sched_job_push(job);
2157 
2158 	dma_fence_put(m->fence);
2159 	m->fence = dma_fence_get(fence);
2160 	mutex_unlock(&m->job_mutex);
2161 
2162 	xe_bb_free(bb, fence);
2163 
2164 	return fence;
2165 
2166 err:
2167 	xe_bb_free(bb, NULL);
2168 
2169 	return ERR_PTR(err);
2170 }
2171 
2172 /**
2173  * xe_migrate_to_vram() - Migrate to VRAM
2174  * @m: The migration context.
2175  * @npages: Number of pages to migrate.
2176  * @src_addr: Array of DMA information (source of migrate)
2177  * @dst_addr: Device physical address of VRAM (destination of migrate)
2178  *
2179  * Copy from an array of DMA addresses to a VRAM device physical address.
2180  *
2181  * Return: dma fence for migrate to signal completion on success, ERR_PTR on
2182  * failure
2183  */
xe_migrate_to_vram(struct xe_migrate * m,unsigned long npages,struct drm_pagemap_addr * src_addr,u64 dst_addr)2184 struct dma_fence *xe_migrate_to_vram(struct xe_migrate *m,
2185 				     unsigned long npages,
2186 				     struct drm_pagemap_addr *src_addr,
2187 				     u64 dst_addr)
2188 {
2189 	return xe_migrate_vram(m, npages * PAGE_SIZE, 0, src_addr, dst_addr,
2190 			       XE_MIGRATE_COPY_TO_VRAM);
2191 }
2192 
2193 /**
2194  * xe_migrate_from_vram() - Migrate from VRAM
2195  * @m: The migration context.
2196  * @npages: Number of pages to migrate.
2197  * @src_addr: Device physical address of VRAM (source of migrate)
2198  * @dst_addr: Array of DMA information (destination of migrate)
2199  *
2200  * Copy from a VRAM device physical address to an array of DMA addresses.
2201  *
2202  * Return: dma fence for migrate to signal completion on success, ERR_PTR on
2203  * failure
2204  */
xe_migrate_from_vram(struct xe_migrate * m,unsigned long npages,u64 src_addr,struct drm_pagemap_addr * dst_addr)2205 struct dma_fence *xe_migrate_from_vram(struct xe_migrate *m,
2206 				       unsigned long npages,
2207 				       u64 src_addr,
2208 				       struct drm_pagemap_addr *dst_addr)
2209 {
2210 	return xe_migrate_vram(m, npages * PAGE_SIZE, 0, dst_addr, src_addr,
2211 			       XE_MIGRATE_COPY_TO_SRAM);
2212 }
2213 
xe_migrate_dma_unmap(struct xe_device * xe,struct drm_pagemap_addr * pagemap_addr,int len,int write)2214 static void xe_migrate_dma_unmap(struct xe_device *xe,
2215 				 struct drm_pagemap_addr *pagemap_addr,
2216 				 int len, int write)
2217 {
2218 	unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE);
2219 
2220 	for (i = 0; i < npages; ++i) {
2221 		if (!pagemap_addr[i].addr)
2222 			break;
2223 
2224 		dma_unmap_page(xe->drm.dev, pagemap_addr[i].addr, PAGE_SIZE,
2225 			       write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
2226 	}
2227 	kfree(pagemap_addr);
2228 }
2229 
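/*
 * DMA-map a kernel buffer (vmalloc or linearly mapped) one page at a time for
 * GPU access; the caller releases the mapping with xe_migrate_dma_unmap().
 */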
xe_migrate_dma_map(struct xe_device * xe,void * buf,int len,int write)2230 static struct drm_pagemap_addr *xe_migrate_dma_map(struct xe_device *xe,
2231 						   void *buf, int len,
2232 						   int write)
2233 {
2234 	struct drm_pagemap_addr *pagemap_addr;
2235 	unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE);
2236 
2237 	pagemap_addr = kcalloc(npages, sizeof(*pagemap_addr), GFP_KERNEL);
2238 	if (!pagemap_addr)
2239 		return ERR_PTR(-ENOMEM);
2240 
2241 	for (i = 0; i < npages; ++i) {
2242 		dma_addr_t addr;
2243 		struct page *page;
2244 		enum dma_data_direction dir = write ? DMA_TO_DEVICE :
2245 						      DMA_FROM_DEVICE;
2246 
2247 		if (is_vmalloc_addr(buf))
2248 			page = vmalloc_to_page(buf);
2249 		else
2250 			page = virt_to_page(buf);
2251 
2252 		addr = dma_map_page(xe->drm.dev, page, 0, PAGE_SIZE, dir);
2253 		if (dma_mapping_error(xe->drm.dev, addr))
2254 			goto err_fault;
2255 
2256 		pagemap_addr[i] =
2257 			drm_pagemap_addr_encode(addr,
2258 						DRM_INTERCONNECT_SYSTEM,
2259 						0, dir);
2260 		buf += PAGE_SIZE;
2261 	}
2262 
2263 	return pagemap_addr;
2264 
2265 err_fault:
2266 	xe_migrate_dma_unmap(xe, pagemap_addr, len, write);
2267 	return ERR_PTR(-EFAULT);
2268 }
2269 
2270 /**
2271  * xe_migrate_access_memory - Access memory of a BO via GPU
2272  *
2273  * @m: The migration context.
2274  * @bo: buffer object
2275  * @offset: access offset into buffer object
2276  * @buf: pointer to caller memory to read into or write from
2277  * @len: length of access
2278  * @write: write access
2279  *
2280  * Access the memory of a BO via the GPU, either reading into or writing from
2281  * a passed-in pointer. The pointer is DMA-mapped for GPU access and GPU
2282  * commands are issued to read into or write from it.
2283  *
2284  * Returns:
2285  * 0 if successful, negative error code on failure.
2286  */
xe_migrate_access_memory(struct xe_migrate * m,struct xe_bo * bo,unsigned long offset,void * buf,int len,int write)2287 int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
2288 			     unsigned long offset, void *buf, int len,
2289 			     int write)
2290 {
2291 	struct xe_tile *tile = m->tile;
2292 	struct xe_device *xe = tile_to_xe(tile);
2293 	struct xe_res_cursor cursor;
2294 	struct dma_fence *fence = NULL;
2295 	struct drm_pagemap_addr *pagemap_addr;
2296 	unsigned long page_offset = (unsigned long)buf & ~PAGE_MASK;
2297 	int bytes_left = len, current_page = 0;
2298 	void *orig_buf = buf;
2299 
2300 	xe_bo_assert_held(bo);
2301 
2302 	/* Use a bounce buffer for small and unaligned accesses */
2303 	if (!xe->info.has_mem_copy_instr &&
2304 	    (!IS_ALIGNED(len, 4) ||
2305 	     !IS_ALIGNED(page_offset, XE_CACHELINE_BYTES) ||
2306 	     !IS_ALIGNED(offset, XE_CACHELINE_BYTES))) {
2307 		int buf_offset = 0;
2308 		void *bounce;
2309 		int err;
2310 
2311 		BUILD_BUG_ON(!is_power_of_2(XE_CACHELINE_BYTES));
2312 		bounce = kmalloc(XE_CACHELINE_BYTES, GFP_KERNEL);
2313 		if (!bounce)
2314 			return -ENOMEM;
2315 
2316 		/*
2317 		 * Less than ideal for large unaligned accesses, but those should
2318 		 * be fairly rare; this can be fixed up if it becomes common.
2319 		 */
2320 		do {
2321 			int copy_bytes = min_t(int, bytes_left,
2322 					       XE_CACHELINE_BYTES -
2323 					       (offset & XE_CACHELINE_MASK));
2324 			int ptr_offset = offset & XE_CACHELINE_MASK;
2325 
2326 			err = xe_migrate_access_memory(m, bo,
2327 						       offset &
2328 						       ~XE_CACHELINE_MASK,
2329 						       bounce,
2330 						       XE_CACHELINE_BYTES, 0);
2331 			if (err)
2332 				break;
2333 
2334 			if (write) {
2335 				memcpy(bounce + ptr_offset, buf + buf_offset, copy_bytes);
2336 
2337 				err = xe_migrate_access_memory(m, bo,
2338 							       offset & ~XE_CACHELINE_MASK,
2339 							       bounce,
2340 							       XE_CACHELINE_BYTES, write);
2341 				if (err)
2342 					break;
2343 			} else {
2344 				memcpy(buf + buf_offset, bounce + ptr_offset,
2345 				       copy_bytes);
2346 			}
2347 
2348 			bytes_left -= copy_bytes;
2349 			buf_offset += copy_bytes;
2350 			offset += copy_bytes;
2351 		} while (bytes_left);
2352 
2353 		kfree(bounce);
2354 		return err;
2355 	}
2356 
2357 	pagemap_addr = xe_migrate_dma_map(xe, buf, len + page_offset, write);
2358 	if (IS_ERR(pagemap_addr))
2359 		return PTR_ERR(pagemap_addr);
2360 
2361 	xe_res_first(bo->ttm.resource, offset, xe_bo_size(bo) - offset, &cursor);
2362 
2363 	do {
2364 		struct dma_fence *__fence;
2365 		u64 vram_addr = vram_region_gpu_offset(bo->ttm.resource) +
2366 			cursor.start;
2367 		int current_bytes;
2368 		u32 pitch;
2369 
2370 		if (cursor.size > MAX_PREEMPTDISABLE_TRANSFER)
2371 			current_bytes = min_t(int, bytes_left,
2372 					      MAX_PREEMPTDISABLE_TRANSFER);
2373 		else
2374 			current_bytes = min_t(int, bytes_left, cursor.size);
2375 
2376 		pitch = xe_migrate_copy_pitch(xe, current_bytes);
2377 		if (xe->info.has_mem_copy_instr)
2378 			current_bytes = min_t(int, current_bytes, U16_MAX * pitch);
2379 		else
2380 			current_bytes = min_t(int, current_bytes,
2381 					      round_down(S16_MAX * pitch,
2382 							 XE_CACHELINE_BYTES));
2383 
2384 		__fence = xe_migrate_vram(m, current_bytes,
2385 					  (unsigned long)buf & ~PAGE_MASK,
2386 					  &pagemap_addr[current_page],
2387 					  vram_addr, write ?
2388 					  XE_MIGRATE_COPY_TO_VRAM :
2389 					  XE_MIGRATE_COPY_TO_SRAM);
2390 		if (IS_ERR(__fence)) {
2391 			if (fence) {
2392 				dma_fence_wait(fence, false);
2393 				dma_fence_put(fence);
2394 			}
2395 			fence = __fence;
2396 			goto out_err;
2397 		}
2398 
2399 		dma_fence_put(fence);
2400 		fence = __fence;
2401 
2402 		buf += current_bytes;
2403 		offset += current_bytes;
2404 		current_page = (int)(buf - orig_buf) / PAGE_SIZE;
2405 		bytes_left -= current_bytes;
2406 		if (bytes_left)
2407 			xe_res_next(&cursor, current_bytes);
2408 	} while (bytes_left);
2409 
2410 	dma_fence_wait(fence, false);
2411 	dma_fence_put(fence);
2412 
2413 out_err:
2414 	xe_migrate_dma_unmap(xe, pagemap_addr, len + page_offset, write);
2415 	return IS_ERR(fence) ? PTR_ERR(fence) : 0;
2416 }
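/*
 * Usage sketch (assumes the BO's reservation is already held by the caller):
 *
 *	u8 data[SZ_256];
 *	int ret = xe_migrate_access_memory(tile->migrate, bo, 0, data,
 *					   sizeof(data), false);
 */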
2417 
2418 /**
2419  * xe_migrate_job_lock() - Lock migrate job lock
2420  * @m: The migration context.
2421  * @q: Queue associated with the operation which requires a lock
2422  *
2423  * Lock the migrate job lock if the queue is a migration queue, otherwise
2424  * assert the VM's dma-resv is held (user queues have their own locking).
2425  */
xe_migrate_job_lock(struct xe_migrate * m,struct xe_exec_queue * q)2426 void xe_migrate_job_lock(struct xe_migrate *m, struct xe_exec_queue *q)
2427 {
2428 	bool is_migrate = q == m->q;
2429 
2430 	if (is_migrate)
2431 		mutex_lock(&m->job_mutex);
2432 	else
2433 		xe_vm_assert_held(q->vm);	/* User queue VMs should be locked */
2434 }
2435 
2436 /**
2437  * xe_migrate_job_unlock() - Unlock migrate job lock
2438  * @m: The migration context.
2439  * @q: Queue associated with the operation which requires a lock
2440  *
2441  * Unlock the migrate job lock if the queue is a migration queue, otherwise
2442  * assert the VM's dma-resv is held (user queues have their own locking).
2443  */
xe_migrate_job_unlock(struct xe_migrate * m,struct xe_exec_queue * q)2444 void xe_migrate_job_unlock(struct xe_migrate *m, struct xe_exec_queue *q)
2445 {
2446 	bool is_migrate = q == m->q;
2447 
2448 	if (is_migrate)
2449 		mutex_unlock(&m->job_mutex);
2450 	else
2451 		xe_vm_assert_held(q->vm);	/* User queue VMs should be locked */
2452 }
2453 
2454 #if IS_ENABLED(CONFIG_PROVE_LOCKING)
2455 /**
2456  * xe_migrate_job_lock_assert() - Assert that the queue's migrate job lock is held
2457  * @q: Migrate queue
2458  */
xe_migrate_job_lock_assert(struct xe_exec_queue * q)2459 void xe_migrate_job_lock_assert(struct xe_exec_queue *q)
2460 {
2461 	struct xe_migrate *m = gt_to_tile(q->gt)->migrate;
2462 
2463 	xe_gt_assert(q->gt, q == m->q);
2464 	lockdep_assert_held(&m->job_mutex);
2465 }
2466 #endif
2467 
2468 #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
2469 #include "tests/xe_migrate.c"
2470 #endif
2471