xref: /linux/drivers/gpu/drm/xe/xe_migrate.c (revision dd08ebf6c3525a7ea2186e636df064ea47281987)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2020 Intel Corporation
4  */
5 #include "xe_migrate.h"
6 
7 #include "xe_bb.h"
8 #include "xe_bo.h"
9 #include "xe_engine.h"
10 #include "xe_ggtt.h"
11 #include "xe_gt.h"
12 #include "xe_hw_engine.h"
13 #include "xe_lrc.h"
14 #include "xe_map.h"
15 #include "xe_mocs.h"
16 #include "xe_pt.h"
17 #include "xe_res_cursor.h"
18 #include "xe_sched_job.h"
19 #include "xe_sync.h"
20 #include "xe_trace.h"
21 #include "xe_vm.h"
22 
23 #include <linux/sizes.h>
24 #include <drm/drm_managed.h>
25 #include <drm/ttm/ttm_tt.h>
26 #include <drm/xe_drm.h>
27 
28 #include "gt/intel_gpu_commands.h"
29 
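/*
 * struct xe_migrate - migrate context for a GT
 *
 * Bundles the kernel copy engine and migration VM used for BO copies,
 * clears and page-table updates, the pinned page-table BO backing the
 * special VM layout, the pre-cleared VRAM chunk used for flat-CCS
 * clearing, the batch-buffer base offsets, the fence of the last
 * submitted job and a suballocator for page-table update slots.
 */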
30 struct xe_migrate {
31 	struct xe_engine *eng;
32 	struct xe_gt *gt;
33 	struct mutex job_mutex;
34 	struct xe_bo *pt_bo;
35 	struct xe_bo *cleared_bo;
36 	u64 batch_base_ofs;
37 	u64 usm_batch_base_ofs;
38 	u64 cleared_vram_ofs;
39 	struct dma_fence *fence;
40 	struct drm_suballoc_manager vm_update_sa;
41 };
42 
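/*
 * Transfer sizing: each blit chunk is capped at MAX_PREEMPTDISABLE_TRANSFER
 * (8 MiB, around 1 ms of copying), so a chunk needs at most NUM_PT_PER_BLIT
 * (8 MiB / 2 MiB = 4) level-0 page tables to map a system-memory source or
 * destination. All NUM_PT_SLOTS page-table pages fit within the lowest 2 MiB
 * of the migration VM; see the layout comment in xe_migrate_prepare_vm().
 */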
43 #define MAX_PREEMPTDISABLE_TRANSFER SZ_8M /* Around 1ms. */
44 #define NUM_KERNEL_PDE 17
45 #define NUM_PT_SLOTS 32
46 #define NUM_PT_PER_BLIT (MAX_PREEMPTDISABLE_TRANSFER / SZ_2M)
47 
48 struct xe_engine *xe_gt_migrate_engine(struct xe_gt *gt)
49 {
50 	return gt->migrate->eng;
51 }
52 
53 static void xe_migrate_fini(struct drm_device *dev, void *arg)
54 {
55 	struct xe_migrate *m = arg;
56 	struct ww_acquire_ctx ww;
57 
58 	xe_vm_lock(m->eng->vm, &ww, 0, false);
59 	xe_bo_unpin(m->pt_bo);
60 	if (m->cleared_bo)
61 		xe_bo_unpin(m->cleared_bo);
62 	xe_vm_unlock(m->eng->vm, &ww);
63 
64 	dma_fence_put(m->fence);
65 	if (m->cleared_bo)
66 		xe_bo_put(m->cleared_bo);
67 	xe_bo_put(m->pt_bo);
68 	drm_suballoc_manager_fini(&m->vm_update_sa);
69 	mutex_destroy(&m->job_mutex);
70 	xe_vm_close_and_put(m->eng->vm);
71 	xe_engine_put(m->eng);
72 }
73 
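/*
 * VM address at which page-table slot @slot is mapped when used as a
 * level @level page table. Slot/offset 0 is reserved for the mapping of
 * the PT BO and the batch buffer, so with 2 MiB per level-0 page table,
 * slot 0 at level 0 maps at 2 MiB, slot 1 at 4 MiB, and so on.
 */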
74 static u64 xe_migrate_vm_addr(u64 slot, u32 level)
75 {
76 	XE_BUG_ON(slot >= NUM_PT_SLOTS);
77 
78 	/* First slot is reserved for mapping of PT bo and bb, start from 1 */
79 	return (slot + 1ULL) << xe_pt_shift(level + 1);
80 }
81 
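/*
 * GPU virtual address of a VRAM physical address, using the identity
 * mapping of VRAM that xe_migrate_prepare_vm() sets up at the 256 GiB
 * mark of the migration VM.
 */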
82 static u64 xe_migrate_vram_ofs(u64 addr)
83 {
84 	return addr + (256ULL << xe_pt_shift(2));
85 }
86 
87 /*
88  * For flat CCS clearing we need a cleared chunk of memory to copy from,
89  * since the CCS clearing mode of XY_FAST_COLOR_BLT appears to be buggy
90  * (it clears only 14 bytes in each chunk of 16).
91  * If clearing the main surface one can use the part of the main surface
92  * already cleared, but for clearing as part of copying non-compressed
93  * data out of system memory, we don't readily have a cleared part of
94  * VRAM to copy from, so create one to use for that case.
95  */
96 static int xe_migrate_create_cleared_bo(struct xe_migrate *m, struct xe_vm *vm)
97 {
98 	struct xe_gt *gt = m->gt;
99 	struct xe_device *xe = vm->xe;
100 	size_t cleared_size;
101 	u64 vram_addr;
102 	bool is_vram;
103 
104 	if (!xe_device_has_flat_ccs(xe))
105 		return 0;
106 
107 	cleared_size = xe_device_ccs_bytes(xe, MAX_PREEMPTDISABLE_TRANSFER);
108 	cleared_size = PAGE_ALIGN(cleared_size);
109 	m->cleared_bo = xe_bo_create_pin_map(xe, gt, vm, cleared_size,
110 					     ttm_bo_type_kernel,
111 					     XE_BO_CREATE_VRAM_IF_DGFX(gt) |
112 					     XE_BO_CREATE_PINNED_BIT);
113 	if (IS_ERR(m->cleared_bo))
114 		return PTR_ERR(m->cleared_bo);
115 
116 	xe_map_memset(xe, &m->cleared_bo->vmap, 0, 0x00, cleared_size);
117 	vram_addr = xe_bo_addr(m->cleared_bo, 0, GEN8_PAGE_SIZE, &is_vram);
118 	XE_BUG_ON(!is_vram);
119 	m->cleared_vram_ofs = xe_migrate_vram_ofs(vram_addr);
120 
121 	return 0;
122 }
123 
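/*
 * Build the special migration VM layout described in the block comment
 * near the end of this function: pin a BO holding the NUM_PT_SLOTS
 * page-table pages, wire it into the first entry of the VM's root page
 * table, map the kernel batch-buffer pool (integrated) or record its
 * identity-mapped VRAM offset (discrete), identity-map VRAM at the
 * 256 GiB mark on discrete, and initialize the suballocator used for
 * page-table update writes.
 */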
124 static int xe_migrate_prepare_vm(struct xe_gt *gt, struct xe_migrate *m,
125 				 struct xe_vm *vm)
126 {
127 	u8 id = gt->info.id;
128 	u32 num_entries = NUM_PT_SLOTS, num_level = vm->pt_root[id]->level;
129 	u32 map_ofs, level, i;
130 	struct xe_device *xe = gt_to_xe(m->gt);
131 	struct xe_bo *bo, *batch = gt->kernel_bb_pool.bo;
132 	u64 entry;
133 	int ret;
134 
135 	/* Can't bump NUM_PT_SLOTS too high */
136 	BUILD_BUG_ON(NUM_PT_SLOTS > SZ_2M/GEN8_PAGE_SIZE);
137 	/* Must be a multiple of 64K to support all platforms */
138 	BUILD_BUG_ON(NUM_PT_SLOTS * GEN8_PAGE_SIZE % SZ_64K);
139 	/* And one slot reserved for the 4KiB page table updates */
140 	BUILD_BUG_ON(!(NUM_KERNEL_PDE & 1));
141 
142 	/* Need to be sure everything fits in the first PT, or create more */
143 	XE_BUG_ON(m->batch_base_ofs + batch->size >= SZ_2M);
144 
145 	bo = xe_bo_create_pin_map(vm->xe, m->gt, vm,
146 				  num_entries * GEN8_PAGE_SIZE,
147 				  ttm_bo_type_kernel,
148 				  XE_BO_CREATE_VRAM_IF_DGFX(m->gt) |
149 				  XE_BO_CREATE_PINNED_BIT);
150 	if (IS_ERR(bo))
151 		return PTR_ERR(bo);
152 
153 	ret = xe_migrate_create_cleared_bo(m, vm);
154 	if (ret) {
155 		xe_bo_put(bo);
156 		return ret;
157 	}
158 
159 	entry = gen8_pde_encode(bo, bo->size - GEN8_PAGE_SIZE, XE_CACHE_WB);
160 	xe_pt_write(xe, &vm->pt_root[id]->bo->vmap, 0, entry);
161 
162 	map_ofs = (num_entries - num_level) * GEN8_PAGE_SIZE;
163 
164 	/* Map the entire BO in our level 0 pt */
165 	for (i = 0, level = 0; i < num_entries; level++) {
166 		entry = gen8_pte_encode(NULL, bo, i * GEN8_PAGE_SIZE,
167 					XE_CACHE_WB, 0, 0);
168 
169 		xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64, entry);
170 
171 		if (vm->flags & XE_VM_FLAGS_64K)
172 			i += 16;
173 		else
174 			i += 1;
175 	}
176 
177 	if (!IS_DGFX(xe)) {
178 		XE_BUG_ON(xe->info.supports_usm);
179 
180 		/* Write out batch too */
181 		m->batch_base_ofs = NUM_PT_SLOTS * GEN8_PAGE_SIZE;
182 		for (i = 0; i < batch->size;
183 		     i += vm->flags & XE_VM_FLAGS_64K ? GEN8_64K_PAGE_SIZE :
184 			     GEN8_PAGE_SIZE) {
185 			entry = gen8_pte_encode(NULL, batch, i,
186 						XE_CACHE_WB, 0, 0);
187 
188 			xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64,
189 				  entry);
190 			level++;
191 		}
192 	} else {
193 		bool is_lmem;
194 		u64 batch_addr = xe_bo_addr(batch, 0, GEN8_PAGE_SIZE, &is_lmem);
195 
196 		m->batch_base_ofs = xe_migrate_vram_ofs(batch_addr);
197 
198 		if (xe->info.supports_usm) {
199 			batch = gt->usm.bb_pool.bo;
200 			batch_addr = xe_bo_addr(batch, 0, GEN8_PAGE_SIZE,
201 						&is_lmem);
202 			m->usm_batch_base_ofs = xe_migrate_vram_ofs(batch_addr);
203 		}
204 	}
205 
206 	for (level = 1; level < num_level; level++) {
207 		u32 flags = 0;
208 
209 		if (vm->flags & XE_VM_FLAGS_64K && level == 1)
210 			flags = GEN12_PDE_64K;
211 
212 		entry = gen8_pde_encode(bo, map_ofs + (level - 1) *
213 					GEN8_PAGE_SIZE, XE_CACHE_WB);
214 		xe_map_wr(xe, &bo->vmap, map_ofs + GEN8_PAGE_SIZE * level, u64,
215 			  entry | flags);
216 	}
217 
218 	/* Write PDE's that point to our BO. */
219 	for (i = 0; i < num_entries - num_level; i++) {
220 		entry = gen8_pde_encode(bo, i * GEN8_PAGE_SIZE,
221 					XE_CACHE_WB);
222 
223 		xe_map_wr(xe, &bo->vmap, map_ofs + GEN8_PAGE_SIZE +
224 			  (i + 1) * 8, u64, entry);
225 	}
226 
227 	/* Identity map the entire vram at 256GiB offset */
228 	if (IS_DGFX(xe)) {
229 		u64 pos, ofs, flags;
230 
231 		level = 2;
232 		ofs = map_ofs + GEN8_PAGE_SIZE * level + 256 * 8;
233 		flags = GEN8_PAGE_RW | GEN8_PAGE_PRESENT | PPAT_CACHED |
234 			GEN12_PPGTT_PTE_LM | GEN8_PDPE_PS_1G;
235 
236 		/*
237 		 * Use 1GB pages; it shouldn't matter that the physical amount
238 		 * of vram is less, since we don't access it.
239 		 */
240 		for (pos = 0; pos < xe->mem.vram.size; pos += SZ_1G, ofs += 8)
241 			xe_map_wr(xe, &bo->vmap, ofs, u64, pos | flags);
242 	}
243 
244 	/*
245 	 * Example layout created above, with root level = 3:
246 	 * [PT0...PT7]: kernel PT's for copy/clear; 64 KiB or 4 KiB PTE's
247 	 * [PT8]: Kernel PT for VM_BIND, 4 KiB PTE's
248 	 * [PT9...PT28]: Userspace PT's for VM_BIND, 4 KiB PTE's
249 	 * [PT29 = PDE 0] [PT30 = PDE 1] [PT31 = PDE 2]
250 	 *
251 	 * This makes the lowest part of the VM point to the pagetables.
252 	 * Hence the lowest 2M in the vm should point to itself. With a few
253 	 * writes and flushes, other parts of the VM can then be used for
254 	 * copying or clearing.
255 	 *
256 	 * For performance, the kernel reserves PDE's, so about 20 are left
257 	 * for async VM updates.
258 	 *
259 	 * To make things easier to work with, each scratch PT is put in slot
260 	 * (1 + PT #) everywhere; this allows lockless updates to scratch pages
261 	 * by using the different addresses in the VM.
262 	 */
263 #define NUM_VMUSA_UNIT_PER_PAGE	32
264 #define VM_SA_UPDATE_UNIT_SIZE	(GEN8_PAGE_SIZE / NUM_VMUSA_UNIT_PER_PAGE)
265 #define NUM_VMUSA_WRITES_PER_UNIT	(VM_SA_UPDATE_UNIT_SIZE / sizeof(u64))
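	/*
	 * Each 4 KiB page-table page provides NUM_VMUSA_UNIT_PER_PAGE (32)
	 * VM-update suballocation units of VM_SA_UPDATE_UNIT_SIZE
	 * (4096 / 32 = 128) bytes, i.e. NUM_VMUSA_WRITES_PER_UNIT (16) qword
	 * PTE writes per unit.
	 */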
266 	drm_suballoc_manager_init(&m->vm_update_sa,
267 				  (map_ofs / GEN8_PAGE_SIZE - NUM_KERNEL_PDE) *
268 				  NUM_VMUSA_UNIT_PER_PAGE, 0);
269 
270 	m->pt_bo = bo;
271 	return 0;
272 }
273 
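/*
 * Initialize a migrate context for a GT: create the migration VM with the
 * layout prepared by xe_migrate_prepare_vm(), create a kernel copy engine
 * bound to that VM (using the reserved BCS instance when USM is supported)
 * and register a drm-managed action that tears everything down again.
 * Returns the new struct xe_migrate or an ERR_PTR() on failure.
 */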
274 struct xe_migrate *xe_migrate_init(struct xe_gt *gt)
275 {
276 	struct xe_device *xe = gt_to_xe(gt);
277 	struct xe_migrate *m;
278 	struct xe_vm *vm;
279 	struct ww_acquire_ctx ww;
280 	int err;
281 
282 	XE_BUG_ON(xe_gt_is_media_type(gt));
283 
284 	m = drmm_kzalloc(&xe->drm, sizeof(*m), GFP_KERNEL);
285 	if (!m)
286 		return ERR_PTR(-ENOMEM);
287 
288 	m->gt = gt;
289 
290 	/* Special layout, prepared below.. */
291 	vm = xe_vm_create(xe, XE_VM_FLAG_MIGRATION |
292 			  XE_VM_FLAG_SET_GT_ID(gt));
293 	if (IS_ERR(vm))
294 		return ERR_CAST(vm);
295 
296 	xe_vm_lock(vm, &ww, 0, false);
297 	err = xe_migrate_prepare_vm(gt, m, vm);
298 	xe_vm_unlock(vm, &ww);
299 	if (err) {
300 		xe_vm_close_and_put(vm);
301 		return ERR_PTR(err);
302 	}
303 
304 	if (xe->info.supports_usm) {
305 		struct xe_hw_engine *hwe = xe_gt_hw_engine(gt,
306 							   XE_ENGINE_CLASS_COPY,
307 							   gt->usm.reserved_bcs_instance,
308 							   false);
309 		if (!hwe)
310 			return ERR_PTR(-EINVAL);
311 
312 		m->eng = xe_engine_create(xe, vm,
313 					  BIT(hwe->logical_instance), 1,
314 					  hwe, ENGINE_FLAG_KERNEL);
315 	} else {
316 		m->eng = xe_engine_create_class(xe, gt, vm,
317 						XE_ENGINE_CLASS_COPY,
318 						ENGINE_FLAG_KERNEL);
319 	}
320 	if (IS_ERR(m->eng)) {
321 		xe_vm_close_and_put(vm);
322 		return ERR_CAST(m->eng);
323 	}
324 
325 	mutex_init(&m->job_mutex);
326 
327 	err = drmm_add_action_or_reset(&xe->drm, xe_migrate_fini, m);
328 	if (err)
329 		return ERR_PTR(err);
330 
331 	return m;
332 }
333 
334 static void emit_arb_clear(struct xe_bb *bb)
335 {
336 	/* 1 dword */
337 	bb->cs[bb->len++] = MI_ARB_ON_OFF | MI_ARB_DISABLE;
338 }
339 
340 static u64 xe_migrate_res_sizes(struct xe_res_cursor *cur)
341 {
342 	/*
343 	 * For VRAM we use identity-mapped pages, so we are limited to the
344 	 * current cursor size. For system memory we program the pages
345 	 * ourselves, so there is no such limitation.
346 	 */
347 	return min_t(u64, MAX_PREEMPTDISABLE_TRANSFER,
348 		     mem_type_is_vram(cur->mem_type) ? cur->size :
349 		     cur->remaining);
350 }
351 
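/*
 * Number of batch-buffer dwords needed to set up one side of a transfer:
 * for system memory this is a 3-dword MI_STORE_DATA_IMM header per chunk
 * of up to 0x1ff PTEs, two dwords per PTE and @cmd_size for the blit
 * itself, with *L0 clipped to what @avail_pts level-0 page tables can map;
 * for VRAM only @cmd_size is needed since the identity map is used.
 * *L0_ofs returns the GPU VA to blit from/to and *L0_pt the PT slot used.
 */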
352 static u32 pte_update_size(struct xe_migrate *m,
353 			   bool is_vram,
354 			   struct xe_res_cursor *cur,
355 			   u64 *L0, u64 *L0_ofs, u32 *L0_pt,
356 			   u32 cmd_size, u32 pt_ofs, u32 avail_pts)
357 {
358 	u32 cmds = 0;
359 
360 	*L0_pt = pt_ofs;
361 	if (!is_vram) {
362 		/* Clip L0 to available size */
363 		u64 size = min(*L0, (u64)avail_pts * SZ_2M);
364 		u64 num_4k_pages = DIV_ROUND_UP(size, GEN8_PAGE_SIZE);
365 
366 		*L0 = size;
367 		*L0_ofs = xe_migrate_vm_addr(pt_ofs, 0);
368 
369 		/* MI_STORE_DATA_IMM */
370 		cmds += 3 * DIV_ROUND_UP(num_4k_pages, 0x1ff);
371 
372 		/* PTE qwords */
373 		cmds += num_4k_pages * 2;
374 
375 		/* Each chunk has a single blit command */
376 		cmds += cmd_size;
377 	} else {
378 		/* Offset into identity map. */
379 		*L0_ofs = xe_migrate_vram_ofs(cur->start);
380 		cmds += cmd_size;
381 	}
382 
383 	return cmds;
384 }
385 
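/*
 * Emit MI_STORE_DATA_IMM commands writing the PTEs for @size bytes of
 * @cur into the page-table page at slot @at_pt, in chunks of at most
 * 0x1ff PTEs per command. VRAM pages get the LM bit (plus the 64K hint
 * where the VM uses 64K pages); system-memory pages use the cursor's
 * DMA addresses.
 */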
386 static void emit_pte(struct xe_migrate *m,
387 		     struct xe_bb *bb, u32 at_pt,
388 		     bool is_vram,
389 		     struct xe_res_cursor *cur,
390 		     u32 size, struct xe_bo *bo)
391 {
392 	u32 ptes;
393 	u64 ofs = at_pt * GEN8_PAGE_SIZE;
394 	u64 cur_ofs;
395 
396 	/*
397 	 * FIXME: Emitting VRAM PTEs to L0 PTs is forbidden. Currently
398 	 * we're only emitting VRAM PTEs during sanity tests, so when
399 	 * that's moved to a Kunit test, we should condition VRAM PTEs
400 	 * on running tests.
401 	 */
402 
403 	ptes = DIV_ROUND_UP(size, GEN8_PAGE_SIZE);
404 
405 	while (ptes) {
406 		u32 chunk = min(0x1ffU, ptes);
407 
408 		bb->cs[bb->len++] = MI_STORE_DATA_IMM | BIT(21) |
409 			(chunk * 2 + 1);
410 		bb->cs[bb->len++] = ofs;
411 		bb->cs[bb->len++] = 0;
412 
413 		cur_ofs = ofs;
414 		ofs += chunk * 8;
415 		ptes -= chunk;
416 
417 		while (chunk--) {
418 			u64 addr;
419 
420 			XE_BUG_ON(cur->start & (PAGE_SIZE - 1));
421 
422 			if (is_vram) {
423 				addr = cur->start;
424 
425 				/* Is this a 64K PTE entry? */
426 				if ((m->eng->vm->flags & XE_VM_FLAGS_64K) &&
427 				    !(cur_ofs & (16 * 8 - 1))) {
428 					XE_WARN_ON(!IS_ALIGNED(addr, SZ_64K));
429 					addr |= GEN12_PTE_PS64;
430 				}
431 
432 				addr |= GEN12_PPGTT_PTE_LM;
433 			} else {
434 				addr = xe_res_dma(cur);
435 			}
436 			addr |= PPAT_CACHED | GEN8_PAGE_PRESENT | GEN8_PAGE_RW;
437 			bb->cs[bb->len++] = lower_32_bits(addr);
438 			bb->cs[bb->len++] = upper_32_bits(addr);
439 
440 			xe_res_next(cur, PAGE_SIZE);
441 			cur_ofs += 8;
442 		}
443 	}
444 }
445 
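/*
 * Emit an XY_CTRL_SURF_COPY_BLT moving the CCS (compression control
 * surface) metadata for @size bytes of main surface between @src_ofs and
 * @dst_ofs. Judging by the callers, an "indirect" offset refers to a main
 * surface whose CCS the hardware resolves via flat CCS, while a "direct"
 * offset points at raw CCS data in memory.
 */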
446 #define EMIT_COPY_CCS_DW 5
447 static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
448 			  u64 dst_ofs, bool dst_is_indirect,
449 			  u64 src_ofs, bool src_is_indirect,
450 			  u32 size)
451 {
452 	u32 *cs = bb->cs + bb->len;
453 	u32 num_ccs_blks;
454 	u32 mocs = xe_mocs_index_to_value(gt->mocs.uc_index);
455 
456 	num_ccs_blks = DIV_ROUND_UP(xe_device_ccs_bytes(gt_to_xe(gt), size),
457 				    NUM_CCS_BYTES_PER_BLOCK);
458 	XE_BUG_ON(num_ccs_blks > NUM_CCS_BLKS_PER_XFER);
459 	*cs++ = XY_CTRL_SURF_COPY_BLT |
460 		(src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT |
461 		(dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT |
462 		((num_ccs_blks - 1) & CCS_SIZE_MASK) << CCS_SIZE_SHIFT;
463 	*cs++ = lower_32_bits(src_ofs);
464 	*cs++ = upper_32_bits(src_ofs) |
465 		FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, mocs);
466 	*cs++ = lower_32_bits(dst_ofs);
467 	*cs++ = upper_32_bits(dst_ofs) |
468 		FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, mocs);
469 
470 	bb->len = cs - bb->cs;
471 }
472 
473 #define EMIT_COPY_DW 10
474 static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
475 		      u64 src_ofs, u64 dst_ofs, unsigned int size,
476 		      unsigned pitch)
477 {
478 	XE_BUG_ON(size / pitch > S16_MAX);
479 	XE_BUG_ON(pitch / 4 > S16_MAX);
480 	XE_BUG_ON(pitch > U16_MAX);
481 
482 	bb->cs[bb->len++] = GEN9_XY_FAST_COPY_BLT_CMD | (10 - 2);
483 	bb->cs[bb->len++] = BLT_DEPTH_32 | pitch;
484 	bb->cs[bb->len++] = 0;
485 	bb->cs[bb->len++] = (size / pitch) << 16 | pitch / 4;
486 	bb->cs[bb->len++] = lower_32_bits(dst_ofs);
487 	bb->cs[bb->len++] = upper_32_bits(dst_ofs);
488 	bb->cs[bb->len++] = 0;
489 	bb->cs[bb->len++] = pitch;
490 	bb->cs[bb->len++] = lower_32_bits(src_ofs);
491 	bb->cs[bb->len++] = upper_32_bits(src_ofs);
492 }
493 
494 static int job_add_deps(struct xe_sched_job *job, struct dma_resv *resv,
495 			enum dma_resv_usage usage)
496 {
497 	return drm_sched_job_add_resv_dependencies(&job->drm, resv, usage);
498 }
499 
500 static u64 xe_migrate_batch_base(struct xe_migrate *m, bool usm)
501 {
502 	return usm ? m->usm_batch_base_ofs : m->batch_base_ofs;
503 }
504 
505 static u32 xe_migrate_ccs_copy(struct xe_migrate *m,
506 			       struct xe_bb *bb,
507 			       u64 src_ofs, bool src_is_vram,
508 			       u64 dst_ofs, bool dst_is_vram, u32 dst_size,
509 			       u64 ccs_ofs, bool copy_ccs)
510 {
511 	struct xe_gt *gt = m->gt;
512 	u32 flush_flags = 0;
513 
514 	if (xe_device_has_flat_ccs(gt_to_xe(gt)) && !copy_ccs && dst_is_vram) {
515 		/*
516 		 * If the bo doesn't have any CCS metadata attached, we still
517 		 * need to clear it for security reasons.
518 		 */
519 		emit_copy_ccs(gt, bb, dst_ofs, true, m->cleared_vram_ofs, false,
520 			      dst_size);
521 		flush_flags = MI_FLUSH_DW_CCS;
522 	} else if (copy_ccs) {
523 		if (!src_is_vram)
524 			src_ofs = ccs_ofs;
525 		else if (!dst_is_vram)
526 			dst_ofs = ccs_ofs;
527 
528 		/*
529 		 * At the moment, we don't support copying CCS metadata from
530 		 * system to system.
531 		 */
532 		XE_BUG_ON(!src_is_vram && !dst_is_vram);
533 
534 		emit_copy_ccs(gt, bb, dst_ofs, dst_is_vram, src_ofs,
535 			      src_is_vram, dst_size);
536 		if (dst_is_vram)
537 			flush_flags = MI_FLUSH_DW_CCS;
538 	}
539 
540 	return flush_flags;
541 }
542 
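/*
 * Copy the contents of @bo from @src to @dst in chunks of at most
 * MAX_PREEMPTDISABLE_TRANSFER, emitting PTEs into the kernel page-table
 * slots for any system-memory side and using the VRAM identity map
 * otherwise. Flat-CCS metadata is copied or cleared alongside the main
 * surface as needed. The BO's reservation is added as a dependency of the
 * first job only; the returned fence signals completion of the last job,
 * or an ERR_PTR is returned after syncing any partial copy.
 */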
543 struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
544 				  struct xe_bo *bo,
545 				  struct ttm_resource *src,
546 				  struct ttm_resource *dst)
547 {
548 	struct xe_gt *gt = m->gt;
549 	struct xe_device *xe = gt_to_xe(gt);
550 	struct dma_fence *fence = NULL;
551 	u64 size = bo->size;
552 	struct xe_res_cursor src_it, dst_it, ccs_it;
553 	u64 src_L0_ofs, dst_L0_ofs;
554 	u32 src_L0_pt, dst_L0_pt;
555 	u64 src_L0, dst_L0;
556 	int pass = 0;
557 	int err;
558 	bool src_is_vram = mem_type_is_vram(src->mem_type);
559 	bool dst_is_vram = mem_type_is_vram(dst->mem_type);
560 	bool copy_ccs = xe_device_has_flat_ccs(xe) && xe_bo_needs_ccs_pages(bo);
561 	bool copy_system_ccs = copy_ccs && (!src_is_vram || !dst_is_vram);
562 
563 	if (!src_is_vram)
564 		xe_res_first_sg(xe_bo_get_sg(bo), 0, bo->size, &src_it);
565 	else
566 		xe_res_first(src, 0, bo->size, &src_it);
567 	if (!dst_is_vram)
568 		xe_res_first_sg(xe_bo_get_sg(bo), 0, bo->size, &dst_it);
569 	else
570 		xe_res_first(dst, 0, bo->size, &dst_it);
571 
572 	if (copy_system_ccs)
573 		xe_res_first_sg(xe_bo_get_sg(bo), xe_bo_ccs_pages_start(bo),
574 				PAGE_ALIGN(xe_device_ccs_bytes(xe, size)),
575 				&ccs_it);
576 
577 	while (size) {
578 		u32 batch_size = 2; /* arb_clear() + MI_BATCH_BUFFER_END */
579 		struct xe_sched_job *job;
580 		struct xe_bb *bb;
581 		u32 flush_flags;
582 		u32 update_idx;
583 		u64 ccs_ofs, ccs_size;
584 		u32 ccs_pt;
585 		bool usm = xe->info.supports_usm;
586 
587 		src_L0 = xe_migrate_res_sizes(&src_it);
588 		dst_L0 = xe_migrate_res_sizes(&dst_it);
589 
590 		drm_dbg(&xe->drm, "Pass %u, sizes: %llu & %llu\n",
591 			pass++, src_L0, dst_L0);
592 
593 		src_L0 = min(src_L0, dst_L0);
594 
595 		batch_size += pte_update_size(m, src_is_vram, &src_it, &src_L0,
596 					      &src_L0_ofs, &src_L0_pt, 0, 0,
597 					      NUM_PT_PER_BLIT);
598 
599 		batch_size += pte_update_size(m, dst_is_vram, &dst_it, &src_L0,
600 					      &dst_L0_ofs, &dst_L0_pt, 0,
601 					      NUM_PT_PER_BLIT, NUM_PT_PER_BLIT);
602 
603 		if (copy_system_ccs) {
604 			ccs_size = xe_device_ccs_bytes(xe, src_L0);
605 			batch_size += pte_update_size(m, false, &ccs_it, &ccs_size,
606 						      &ccs_ofs, &ccs_pt, 0,
607 						      2 * NUM_PT_PER_BLIT,
608 						      NUM_PT_PER_BLIT);
609 		}
610 
611 		/* Add copy commands size here */
612 		batch_size += EMIT_COPY_DW +
613 			(xe_device_has_flat_ccs(xe) ? EMIT_COPY_CCS_DW : 0);
614 
615 		bb = xe_bb_new(gt, batch_size, usm);
616 		if (IS_ERR(bb)) {
617 			err = PTR_ERR(bb);
618 			goto err_sync;
619 		}
620 
621 		/* Preemption is enabled again by the ring ops. */
622 		if (!src_is_vram || !dst_is_vram)
623 			emit_arb_clear(bb);
624 
625 		if (!src_is_vram)
626 			emit_pte(m, bb, src_L0_pt, src_is_vram, &src_it, src_L0,
627 				 bo);
628 		else
629 			xe_res_next(&src_it, src_L0);
630 
631 		if (!dst_is_vram)
632 			emit_pte(m, bb, dst_L0_pt, dst_is_vram, &dst_it, src_L0,
633 				 bo);
634 		else
635 			xe_res_next(&dst_it, src_L0);
636 
637 		if (copy_system_ccs)
638 			emit_pte(m, bb, ccs_pt, false, &ccs_it, ccs_size, bo);
639 
640 		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
641 		update_idx = bb->len;
642 
643 		emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, src_L0, GEN8_PAGE_SIZE);
644 		flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs, src_is_vram,
645 						  dst_L0_ofs, dst_is_vram,
646 						  src_L0, ccs_ofs, copy_ccs);
647 
648 		mutex_lock(&m->job_mutex);
649 		job = xe_bb_create_migration_job(m->eng, bb,
650 						 xe_migrate_batch_base(m, usm),
651 						 update_idx);
652 		if (IS_ERR(job)) {
653 			err = PTR_ERR(job);
654 			goto err;
655 		}
656 
657 		xe_sched_job_add_migrate_flush(job, flush_flags);
658 		if (!fence) {
659 			err = job_add_deps(job, bo->ttm.base.resv,
660 					   DMA_RESV_USAGE_BOOKKEEP);
661 			if (err)
662 				goto err_job;
663 		}
664 
665 		xe_sched_job_arm(job);
666 		dma_fence_put(fence);
667 		fence = dma_fence_get(&job->drm.s_fence->finished);
668 		xe_sched_job_push(job);
669 
670 		dma_fence_put(m->fence);
671 		m->fence = dma_fence_get(fence);
672 
673 		mutex_unlock(&m->job_mutex);
674 
675 		xe_bb_free(bb, fence);
676 		size -= src_L0;
677 		continue;
678 
679 err_job:
680 		xe_sched_job_put(job);
681 err:
682 		mutex_unlock(&m->job_mutex);
683 		xe_bb_free(bb, NULL);
684 
685 err_sync:
686 		/* Sync partial copy if any. */
687 		if (fence) {
688 			dma_fence_wait(fence, false);
689 			dma_fence_put(fence);
690 		}
691 
692 		return ERR_PTR(err);
693 	}
694 
695 	return fence;
696 }
697 
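/*
 * Emit an XY_FAST_COLOR_BLT filling @size bytes at @src_ofs (the clear
 * target) with @value, using the uncached MOCS index. The command is
 * 16 dwords on graphics version 12.50+ and 11 dwords on earlier platforms.
 */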
698 static int emit_clear(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
699 		      u32 size, u32 pitch, u32 value, bool is_vram)
700 {
701 	u32 *cs = bb->cs + bb->len;
702 	u32 len = XY_FAST_COLOR_BLT_DW;
703 	u32 mocs = xe_mocs_index_to_value(gt->mocs.uc_index);
704 
705 	if (GRAPHICS_VERx100(gt->xe) < 1250)
706 		len = 11;
707 
708 	*cs++ = XY_FAST_COLOR_BLT_CMD | XY_FAST_COLOR_BLT_DEPTH_32 |
709 		(len - 2);
710 	*cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, mocs) |
711 		(pitch - 1);
712 	*cs++ = 0;
713 	*cs++ = (size / pitch) << 16 | pitch / 4;
714 	*cs++ = lower_32_bits(src_ofs);
715 	*cs++ = upper_32_bits(src_ofs);
716 	*cs++ = (is_vram ? 0x0 : 0x1) << XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT;
717 	*cs++ = value;
718 	*cs++ = 0;
719 	*cs++ = 0;
720 	*cs++ = 0;
721 
722 	if (len > 11) {
723 		*cs++ = 0;
724 		*cs++ = 0;
725 		*cs++ = 0;
726 		*cs++ = 0;
727 		*cs++ = 0;
728 	}
729 
730 	XE_BUG_ON(cs - bb->cs != len + bb->len);
731 	bb->len += len;
732 
733 	return 0;
734 }
735 
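/*
 * Fill @bo with @value in chunks of at most MAX_PREEMPTDISABLE_TRANSFER,
 * emitting PTEs for system-memory destinations and clearing the
 * associated flat-CCS metadata for VRAM destinations. Returns the fence
 * of the last clear job, or an ERR_PTR after syncing any partial work.
 */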
736 struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
737 				   struct xe_bo *bo,
738 				   struct ttm_resource *dst,
739 				   u32 value)
740 {
741 	bool clear_vram = mem_type_is_vram(dst->mem_type);
742 	struct xe_gt *gt = m->gt;
743 	struct xe_device *xe = gt_to_xe(gt);
744 	struct dma_fence *fence = NULL;
745 	u64 size = bo->size;
746 	struct xe_res_cursor src_it;
747 	struct ttm_resource *src = dst;
748 	int err;
749 	int pass = 0;
750 
751 	if (!clear_vram)
752 		xe_res_first_sg(xe_bo_get_sg(bo), 0, bo->size, &src_it);
753 	else
754 		xe_res_first(src, 0, bo->size, &src_it);
755 
756 	while (size) {
757 		u64 clear_L0_ofs;
758 		u32 clear_L0_pt;
759 		u32 flush_flags = 0;
760 		u64 clear_L0;
761 		struct xe_sched_job *job;
762 		struct xe_bb *bb;
763 		u32 batch_size, update_idx;
764 		bool usm = xe->info.supports_usm;
765 
766 		clear_L0 = xe_migrate_res_sizes(&src_it);
767 		drm_dbg(&xe->drm, "Pass %u, size: %llu\n", pass++, clear_L0);
768 
769 		/* Calculate final sizes and batch size.. */
770 		batch_size = 2 +
771 			pte_update_size(m, clear_vram, &src_it,
772 					&clear_L0, &clear_L0_ofs, &clear_L0_pt,
773 					XY_FAST_COLOR_BLT_DW, 0, NUM_PT_PER_BLIT);
774 		if (xe_device_has_flat_ccs(xe) && clear_vram)
775 			batch_size += EMIT_COPY_CCS_DW;
776 
777 		/* Clear commands */
778 
779 		if (WARN_ON_ONCE(!clear_L0))
780 			break;
781 
782 		bb = xe_bb_new(gt, batch_size, usm);
783 		if (IS_ERR(bb)) {
784 			err = PTR_ERR(bb);
785 			goto err_sync;
786 		}
787 
788 		size -= clear_L0;
789 
790 		/* TODO: Add dependencies here */
791 
792 		/* Preemption is enabled again by the ring ops. */
793 		if (!clear_vram) {
794 			emit_arb_clear(bb);
795 			emit_pte(m, bb, clear_L0_pt, clear_vram, &src_it, clear_L0,
796 				 bo);
797 		} else {
798 			xe_res_next(&src_it, clear_L0);
799 		}
800 		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
801 		update_idx = bb->len;
802 
803 		emit_clear(gt, bb, clear_L0_ofs, clear_L0, GEN8_PAGE_SIZE,
804 			   value, clear_vram);
805 		if (xe_device_has_flat_ccs(xe) && clear_vram) {
806 			emit_copy_ccs(gt, bb, clear_L0_ofs, true,
807 				      m->cleared_vram_ofs, false, clear_L0);
808 			flush_flags = MI_FLUSH_DW_CCS;
809 		}
810 
811 		mutex_lock(&m->job_mutex);
812 		job = xe_bb_create_migration_job(m->eng, bb,
813 						 xe_migrate_batch_base(m, usm),
814 						 update_idx);
815 		if (IS_ERR(job)) {
816 			err = PTR_ERR(job);
817 			goto err;
818 		}
819 
820 		xe_sched_job_add_migrate_flush(job, flush_flags);
821 
822 		xe_sched_job_arm(job);
823 		dma_fence_put(fence);
824 		fence = dma_fence_get(&job->drm.s_fence->finished);
825 		xe_sched_job_push(job);
826 
827 		dma_fence_put(m->fence);
828 		m->fence = dma_fence_get(fence);
829 
830 		mutex_unlock(&m->job_mutex);
831 
832 		xe_bb_free(bb, fence);
833 		continue;
834 
835 err:
836 		mutex_unlock(&m->job_mutex);
837 		xe_bb_free(bb, NULL);
838 err_sync:
839 		/* Sync partial clears if any. */
840 		if (fence) {
841 			dma_fence_wait(fence, false);
842 			dma_fence_put(fence);
843 		}
844 
845 		return ERR_PTR(err);
846 	}
847 
848 	return fence;
849 }
850 
851 static void write_pgtable(struct xe_gt *gt, struct xe_bb *bb, u64 ppgtt_ofs,
852 			  const struct xe_vm_pgtable_update *update,
853 			  struct xe_migrate_pt_update *pt_update)
854 {
855 	const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
856 	u32 chunk;
857 	u32 ofs = update->ofs, size = update->qwords;
858 
859 	/*
860 	 * If we have 512 entries (max), we would populate it ourselves,
861 	 * and update the PDE above it to the new pointer.
862 	 * The only time this can happen is when we have to update the
863 	 * top-level PDE. This requires a BO that is almost vm->size big.
864 	 *
865 	 * This shouldn't be possible in practice.. might change when 16K
866 	 * pages are used. Hence the BUG_ON.
867 	 */
868 	XE_BUG_ON(update->qwords > 0x1ff);
869 	if (!ppgtt_ofs) {
870 		bool is_lmem;
871 
872 		ppgtt_ofs = xe_migrate_vram_ofs(xe_bo_addr(update->pt_bo, 0,
873 							   GEN8_PAGE_SIZE,
874 							   &is_lmem));
875 		XE_BUG_ON(!is_lmem);
876 	}
877 
878 	do {
879 		u64 addr = ppgtt_ofs + ofs * 8;
880 		chunk = min(update->qwords, 0x1ffU);
881 
882 		/* Ensure populatefn can do memset64 by aligning bb->cs */
883 		if (!(bb->len & 1))
884 			bb->cs[bb->len++] = MI_NOOP;
885 
886 		bb->cs[bb->len++] = MI_STORE_DATA_IMM | BIT(21) |
887 			(chunk * 2 + 1);
888 		bb->cs[bb->len++] = lower_32_bits(addr);
889 		bb->cs[bb->len++] = upper_32_bits(addr);
890 		ops->populate(pt_update, gt, NULL, bb->cs + bb->len, ofs, chunk,
891 			      update);
892 
893 		bb->len += chunk * 2;
894 		ofs += chunk;
895 		size -= chunk;
896 	} while (size);
897 }
898 
899 struct xe_vm *xe_migrate_get_vm(struct xe_migrate *m)
900 {
901 	return xe_vm_get(m->eng->vm);
902 }
903 
904 static struct dma_fence *
905 xe_migrate_update_pgtables_cpu(struct xe_migrate *m,
906 			       struct xe_vm *vm, struct xe_bo *bo,
907 			       const struct xe_vm_pgtable_update *updates,
908 			       u32 num_updates, bool wait_vm,
909 			       struct xe_migrate_pt_update *pt_update)
910 {
911 	const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
912 	struct dma_fence *fence;
913 	int err;
914 	u32 i;
915 
916 	/* Wait on BO moves for 10 ms, then fall back to GPU job */
917 	if (bo) {
918 		long wait;
919 
920 		wait = dma_resv_wait_timeout(bo->ttm.base.resv,
921 					     DMA_RESV_USAGE_KERNEL,
922 					     true, HZ / 100);
923 		if (wait <= 0)
924 			return ERR_PTR(-ETIME);
925 	}
926 	if (wait_vm) {
927 		long wait;
928 
929 		wait = dma_resv_wait_timeout(&vm->resv,
930 					     DMA_RESV_USAGE_BOOKKEEP,
931 					     true, HZ / 100);
932 		if (wait <= 0)
933 			return ERR_PTR(-ETIME);
934 	}
935 
936 	if (ops->pre_commit) {
937 		err = ops->pre_commit(pt_update);
938 		if (err)
939 			return ERR_PTR(err);
940 	}
941 	for (i = 0; i < num_updates; i++) {
942 		const struct xe_vm_pgtable_update *update = &updates[i];
943 
944 		ops->populate(pt_update, m->gt, &update->pt_bo->vmap, NULL,
945 			      update->ofs, update->qwords, update);
946 	}
947 
948 	trace_xe_vm_cpu_bind(vm);
949 	xe_device_wmb(vm->xe);
950 
951 	fence = dma_fence_get_stub();
952 
953 	return fence;
954 }
955 
956 static bool no_in_syncs(struct xe_sync_entry *syncs, u32 num_syncs)
957 {
958 	int i;
959 
960 	for (i = 0; i < num_syncs; i++) {
961 		struct dma_fence *fence = syncs[i].fence;
962 
963 		if (fence && !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
964 				       &fence->flags))
965 			return false;
966 	}
967 
968 	return true;
969 }
970 
971 static bool engine_is_idle(struct xe_engine *e)
972 {
973 	return !e || e->lrc[0].fence_ctx.next_seqno == 1 ||
974 		xe_lrc_seqno(&e->lrc[0]) == e->lrc[0].fence_ctx.next_seqno;
975 }
976 
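/*
 * Apply a set of page-table updates for @vm. With no unsignaled in-syncs
 * and an idle engine the updates are written directly with the CPU;
 * otherwise a migration job is built that maps the PT BOs into the kernel
 * portion of the migration VM (or uses the VRAM identity map on discrete)
 * and writes the entries with MI_STORE_DATA_IMM through the pt_update
 * ops' populate() callback. Returns a fence signalling completion of the
 * updates, or an ERR_PTR on failure.
 */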
977 struct dma_fence *
978 xe_migrate_update_pgtables(struct xe_migrate *m,
979 			   struct xe_vm *vm,
980 			   struct xe_bo *bo,
981 			   struct xe_engine *eng,
982 			   const struct xe_vm_pgtable_update *updates,
983 			   u32 num_updates,
984 			   struct xe_sync_entry *syncs, u32 num_syncs,
985 			   struct xe_migrate_pt_update *pt_update)
986 {
987 	const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
988 	struct xe_gt *gt = m->gt;
989 	struct xe_device *xe = gt_to_xe(gt);
990 	struct xe_sched_job *job;
991 	struct dma_fence *fence;
992 	struct drm_suballoc *sa_bo = NULL;
993 	struct xe_vma *vma = pt_update->vma;
994 	struct xe_bb *bb;
995 	u32 i, batch_size, ppgtt_ofs, update_idx, page_ofs = 0;
996 	u64 addr;
997 	int err = 0;
998 	bool usm = !eng && xe->info.supports_usm;
999 	bool first_munmap_rebind = vma && vma->first_munmap_rebind;
1000 
1001 	/* Use the CPU if there are no in-syncs and the engine is idle */
1002 	if (no_in_syncs(syncs, num_syncs) && engine_is_idle(eng)) {
1003 		fence =  xe_migrate_update_pgtables_cpu(m, vm, bo, updates,
1004 							num_updates,
1005 							first_munmap_rebind,
1006 							pt_update);
1007 		if (!IS_ERR(fence) || fence == ERR_PTR(-EAGAIN))
1008 			return fence;
1009 	}
1010 
1011 	/* fixed + PTE entries */
1012 	if (IS_DGFX(xe))
1013 		batch_size = 2;
1014 	else
1015 		batch_size = 6 + num_updates * 2;
1016 
1017 	for (i = 0; i < num_updates; i++) {
1018 		u32 num_cmds = DIV_ROUND_UP(updates[i].qwords, 0x1ff);
1019 
1020 		/* align noop + MI_STORE_DATA_IMM cmd prefix */
1021 		batch_size += 4 * num_cmds + updates[i].qwords * 2;
1022 	}
1023 
1024 	/*
1025 	 * XXX: Create temp bo to copy from, if batch_size becomes too big?
1026 	 *
1027 	 * Worst case: Sum(2 * (each lower level page size) + (top level page size))
1028 	 * Should be reasonably bound..
1029 	 */
1030 	XE_BUG_ON(batch_size >= SZ_128K);
1031 
1032 	bb = xe_bb_new(gt, batch_size, !eng && xe->info.supports_usm);
1033 	if (IS_ERR(bb))
1034 		return ERR_CAST(bb);
1035 
1036 	/* For sysmem PTE's, need to map them in our hole.. */
1037 	if (!IS_DGFX(xe)) {
1038 		ppgtt_ofs = NUM_KERNEL_PDE - 1;
1039 		if (eng) {
1040 			XE_BUG_ON(num_updates > NUM_VMUSA_WRITES_PER_UNIT);
1041 
1042 			sa_bo = drm_suballoc_new(&m->vm_update_sa, 1,
1043 						 GFP_KERNEL, true, 0);
1044 			if (IS_ERR(sa_bo)) {
1045 				err = PTR_ERR(sa_bo);
1046 				goto err;
1047 			}
1048 
1049 			ppgtt_ofs = NUM_KERNEL_PDE +
1050 				(drm_suballoc_soffset(sa_bo) /
1051 				 NUM_VMUSA_UNIT_PER_PAGE);
1052 			page_ofs = (drm_suballoc_soffset(sa_bo) %
1053 				    NUM_VMUSA_UNIT_PER_PAGE) *
1054 				VM_SA_UPDATE_UNIT_SIZE;
1055 		}
1056 
1057 		/* Preemption is enabled again by the ring ops. */
1058 		emit_arb_clear(bb);
1059 
1060 		/* Map our PT's to gtt */
1061 		bb->cs[bb->len++] = MI_STORE_DATA_IMM | BIT(21) |
1062 			(num_updates * 2 + 1);
1063 		bb->cs[bb->len++] = ppgtt_ofs * GEN8_PAGE_SIZE + page_ofs;
1064 		bb->cs[bb->len++] = 0; /* upper_32_bits */
1065 
1066 		for (i = 0; i < num_updates; i++) {
1067 			struct xe_bo *pt_bo = updates[i].pt_bo;
1068 
1069 			BUG_ON(pt_bo->size != SZ_4K);
1070 
1071 			addr = gen8_pte_encode(NULL, pt_bo, 0, XE_CACHE_WB,
1072 					       0, 0);
1073 			bb->cs[bb->len++] = lower_32_bits(addr);
1074 			bb->cs[bb->len++] = upper_32_bits(addr);
1075 		}
1076 
1077 		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
1078 		update_idx = bb->len;
1079 
1080 		addr = xe_migrate_vm_addr(ppgtt_ofs, 0) +
1081 			(page_ofs / sizeof(u64)) * GEN8_PAGE_SIZE;
1082 		for (i = 0; i < num_updates; i++)
1083 			write_pgtable(m->gt, bb, addr + i * GEN8_PAGE_SIZE,
1084 				      &updates[i], pt_update);
1085 	} else {
1086 		/* phys pages, no preamble required */
1087 		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
1088 		update_idx = bb->len;
1089 
1090 		/* Preemption is enabled again by the ring ops. */
1091 		emit_arb_clear(bb);
1092 		for (i = 0; i < num_updates; i++)
1093 			write_pgtable(m->gt, bb, 0, &updates[i], pt_update);
1094 	}
1095 
1096 	if (!eng)
1097 		mutex_lock(&m->job_mutex);
1098 
1099 	job = xe_bb_create_migration_job(eng ?: m->eng, bb,
1100 					 xe_migrate_batch_base(m, usm),
1101 					 update_idx);
1102 	if (IS_ERR(job)) {
1103 		err = PTR_ERR(job);
1104 		goto err_bb;
1105 	}
1106 
1107 	/* Wait on BO move */
1108 	if (bo) {
1109 		err = job_add_deps(job, bo->ttm.base.resv,
1110 				   DMA_RESV_USAGE_KERNEL);
1111 		if (err)
1112 			goto err_job;
1113 	}
1114 
1115 	/*
1116 	 * Munmap-style VM unbind: we need to wait for all jobs to complete /
1117 	 * trigger preempts before moving forward.
1118 	 */
1119 	if (first_munmap_rebind) {
1120 		err = job_add_deps(job, &vm->resv,
1121 				   DMA_RESV_USAGE_BOOKKEEP);
1122 		if (err)
1123 			goto err_job;
1124 	}
1125 
1126 	for (i = 0; !err && i < num_syncs; i++)
1127 		err = xe_sync_entry_add_deps(&syncs[i], job);
1128 
1129 	if (err)
1130 		goto err_job;
1131 
1132 	if (ops->pre_commit) {
1133 		err = ops->pre_commit(pt_update);
1134 		if (err)
1135 			goto err_job;
1136 	}
1137 	xe_sched_job_arm(job);
1138 	fence = dma_fence_get(&job->drm.s_fence->finished);
1139 	xe_sched_job_push(job);
1140 
1141 	if (!eng)
1142 		mutex_unlock(&m->job_mutex);
1143 
1144 	xe_bb_free(bb, fence);
1145 	drm_suballoc_free(sa_bo, fence);
1146 
1147 	return fence;
1148 
1149 err_job:
1150 	xe_sched_job_put(job);
1151 err_bb:
1152 	if (!eng)
1153 		mutex_unlock(&m->job_mutex);
1154 	xe_bb_free(bb, NULL);
1155 err:
1156 	drm_suballoc_free(sa_bo, NULL);
1157 	return ERR_PTR(err);
1158 }
1159 
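/*
 * Wait for the fence of the most recently submitted migration job, if any.
 */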
1160 void xe_migrate_wait(struct xe_migrate *m)
1161 {
1162 	if (m->fence)
1163 		dma_fence_wait(m->fence, false);
1164 }
1165 
1166 #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
1167 #include "tests/xe_migrate.c"
1168 #endif
1169